summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans
diff options
context:
space:
mode:
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java3
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java271
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java49
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java49
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java8
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java8
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java284
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java3
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java5
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java5
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java71
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java10
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java5
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java5
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java3
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java1
39 files changed, 739 insertions, 64 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index 40fb313d..5df1bee8 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -63,6 +63,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* Abstract base class for k-means implementations.
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @apiviz.composedOf KMeansInitialization
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
index 292e74db..84fb877c 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
@@ -49,6 +49,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.6.0
*
* @param <V> Vector type
* @param <M> Model type
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java
index 4e2bf7a8..42ea12b6 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java
@@ -66,6 +66,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* </p>
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @param <V> Vector type
*/
@@ -124,7 +125,7 @@ public class CLARA<V> extends KMedoidsPAM<V> {
WritableIntegerDataStore bestclusters = null;
Random rnd = random.getSingleThreadedRandom();
- FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Random samples.", numsamples, LOG) : null;
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing random samples", numsamples, LOG) : null;
for(int j = 0; j < numsamples; j++) {
DBIDs rids = DBIDUtil.randomSample(ids, sampling, rnd);
// Choose initial medoids
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
index b242b407..697655d8 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
@@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
* Some constants and options shared among kmeans family algorithms.
*
* @author Erich Schubert
+ * @since 0.2
*
* @param <V> Number vector type
* @param <M> Actual model type
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
index c6082176..8885fc46 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
@@ -69,6 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* blocks.
*
* @author Erich Schubert
+ * @since 0.6.0
*
* @apiviz.has KMeansModel
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
index 70b6a30e..92e00e0d 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
@@ -61,6 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* </p>
*
* @author Stephan Baier
+ * @since 0.6.0
*
* @param <V> Vector type
* @param <M> Model type
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java
new file mode 100644
index 00000000..5632621a
--- /dev/null
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java
@@ -0,0 +1,271 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2015
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+
+/**
+ * Compare-Means: Accelerated k-means by exploiting the triangle inequality and
+ * pairwise distances of means to prune candidate means.
+ *
+ * Reference:
+ * <p>
+ * S. J. Phillips<br />
+ * Acceleration of k-means and related clustering algorithms<br />
+ * Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX
+ * 2002)
+ * </p>
+ *
+ * @author Erich Schubert
+ * @since 0.5.0
+ *
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ */
+@Title("Compare-Means")
+@Reference(authors = "S. J. Phillips", //
+title = "Acceleration of k-means and related clustering algorithms", //
+booktitle = "Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX 2002)", //
+url = "http://dx.doi.org/10.1007/3-540-45643-0_13")
+public class KMeansCompare<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansCompare.class);
+
+ /**
+ * Key for statistics logging.
+ */
+ private static final String KEY = KMeansCompare.class.getName();
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ */
+ public KMeansCompare(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ @Override
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ if(LOG.isStatistics()) {
+ LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
+ }
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
+
+ // Cluster distances
+ double[][] cdist = new double[k][k];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
+ LongStatistic diststat = LOG.isStatistics() ? new LongStatistic(KEY + ".distance-computations") : null;
+ int iteration = 0;
+ for(; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ recomputeSeperation(means, cdist, diststat);
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, cdist, diststat);
+ logVarstat(varstat, varsum);
+ if(LOG.isStatistics()) {
+ LOG.statistics(diststat);
+ }
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute means.
+ means = means(clusters, means, relation);
+ }
+ LOG.setCompleted(prog);
+ if(LOG.isStatistics()) {
+ LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
+ }
+
+ // Wrap result
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ if(ids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(ids, model));
+ }
+ return result;
+ }
+
+ /**
+ * Recompute the separation of cluster means.
+ *
+ * @param means Means
+ * @param cdist Center-to-Center distances
+ * @param diststat Distance counting statistic
+ */
+ private void recomputeSeperation(List<Vector> means, double[][] cdist, LongStatistic diststat) {
+ final int k = means.size();
+ for(int i = 1; i < k; i++) {
+ Vector mi = means.get(i);
+ for(int j = 0; j < i; j++) {
+ double d = distanceFunction.distance(mi, means.get(j));
+ cdist[i][j] = d;
+ cdist[j][i] = d;
+ }
+ }
+ if(diststat != null) {
+ diststat.increment((k * (k - 1)) >> 1);
+ }
+ }
+
+ /**
+ * Reassign objects, but only if their bounds indicate it is necessary to do
+ * so.
+ *
+ * @param relation Data
+ * @param means Current means
+ * @param clusters Current clusters
+ * @param assignment Cluster assignment
+ * @param varsum Variance sum counter
+ * @param cdist Centroid distances
+ * @param diststat Distance statistics
+ * @return true when the object was reassigned
+ */
+ private boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum, double[][] cdist, LongStatistic diststat) {
+ assert (k == means.size());
+ long dists = 0;
+ boolean changed = false;
+ // Reset all clusters
+ Arrays.fill(varsum, 0.);
+ for(ModifiableDBIDs cluster : clusters) {
+ cluster.clear();
+ }
+ final NumberVectorDistanceFunction<?> df = getDistanceFunction();
+ double mult = (df instanceof SquaredEuclideanDistanceFunction) ? 4 : 2;
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ final int cur = assignment.intValue(iditer), ini = cur >= 0 ? cur : 0;
+ // Distance to current mean:
+ V fv = relation.get(iditer);
+ double mindist = df.distance(fv, means.get(ini));
+ ++dists;
+ final double thresh = mult * mindist;
+ int minIndex = ini;
+ for(int i = 0; i < k; i++) {
+ if(i == ini || cdist[minIndex][i] >= thresh) { // Compare pruning
+ continue;
+ }
+ double dist = df.distance(fv, means.get(i));
+ ++dists;
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ varsum[minIndex] += mindist;
+ clusters.get(minIndex).add(iditer);
+ changed |= assignment.putInt(iditer, minIndex) != minIndex;
+ }
+ // Increment distance computations counter.
+ if(diststat != null) {
+ diststat.increment(dists);
+ }
+ return changed;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected void getParameterDistanceFunction(Parameterization config) {
+ super.getParameterDistanceFunction(config);
+ if(distanceFunction instanceof SquaredEuclideanDistanceFunction) {
+ return; // Proper choice.
+ }
+ if(distanceFunction != null && !distanceFunction.isMetric()) {
+ LOG.warning("Compare k-means requires a metric distance, and k-means should only be used with squared Euclidean distance!");
+ }
+ }
+
+ @Override
+ protected KMeansCompare<V> makeInstance() {
+ return new KMeansCompare<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java
index e2dcf88c..58015327 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java
@@ -47,11 +47,14 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
/**
* Elkan's fast k-means by exploiting the triangle inequality.
@@ -69,6 +72,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
* </p>
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.has KMeansModel
*
@@ -90,15 +94,22 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
private static final String KEY = KMeansElkan.class.getName();
/**
+ * Flag whether to compute the final variance statistic.
+ */
+ private boolean varstat = false;
+
+ /**
* Constructor.
*
* @param distanceFunction distance function
* @param k k parameter
* @param maxiter Maxiter parameter
* @param initializer Initialization method
+ * @param varstat Compute the variance statistic
*/
- public KMeansElkan(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ public KMeansElkan(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer, boolean varstat) {
super(distanceFunction, k, maxiter, initializer);
+ this.varstat = varstat;
}
@Override
@@ -108,7 +119,7 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
}
// Choose initial means
if(LOG.isStatistics()) {
- LOG.statistics(new StringStatistic(KEY + ".initializer", initializer.toString()));
+ LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
}
List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
// Setup cluster assignment store
@@ -136,7 +147,7 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
double[][] cdist = new double[k][k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
- LongStatistic varstat = LOG.isStatistics() ? new LongStatistic(this.getClass().getName() + ".reassignments") : null;
+ LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(this.getClass().getName() + ".reassignments") : null;
int iteration = 0;
for(; maxiter <= 0 || iteration < maxiter; iteration++) {
LOG.incrementProcessed(prog);
@@ -148,9 +159,9 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
recomputeSeperation(means, sep, cdist); // #1
changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, cdist, upper, lower);
}
- if(varstat != null) {
- varstat.setLong(changed);
- LOG.statistics(varstat);
+ if(rstat != null) {
+ rstat.setLong(changed);
+ LOG.statistics(rstat);
}
// Stop if no cluster assignment changed.
if(changed == 0) {
@@ -178,6 +189,7 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
lower.destroy();
// Wrap result
+ double totalvariance = 0.;
Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
for(int i = 0; i < clusters.size(); i++) {
DBIDs ids = clusters.get(i);
@@ -189,9 +201,13 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
varsum += distanceFunction.distance(mean, relation.get(it));
}
+ totalvariance += varsum;
KMeansModel model = new KMeansModel(mean, varsum);
result.addToplevelCluster(new Cluster<>(ids, model));
}
+ if(LOG.isStatistics() && varstat) {
+ LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance));
+ }
return result;
}
@@ -395,6 +411,16 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
* @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ /**
+ * Flag to compute the final clustering variance statistic.
+ */
+ public static final OptionID VARSTAT_ID = new OptionID("kmeans.varstat", "Compute the final clustering variance statistic. Needs an additional full pass over the data set.");
+
+ /**
+ * Compute the final variance statisic.
+ */
+ protected boolean varstat = false;
+
@Override
protected Logging getLogger() {
return LOG;
@@ -412,8 +438,17 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean
}
@Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ Flag varF = new Flag(VARSTAT_ID);
+ if(config.grab(varF)) {
+ varstat = varF.isTrue();
+ }
+ }
+
+ @Override
protected KMeansElkan<V> makeInstance() {
- return new KMeansElkan<>(distanceFunction, k, maxiter, initializer);
+ return new KMeansElkan<>(distanceFunction, k, maxiter, initializer, varstat);
}
}
}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java
index 1b114701..06228525 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java
@@ -46,11 +46,14 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
/**
* Hamerly's fast k-means by exploiting the triangle inequality.
@@ -63,6 +66,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
* </p>
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.has KMeansModel
*
@@ -84,15 +88,22 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
private static final String KEY = KMeansHamerly.class.getName();
/**
+ * Flag whether to compute the final variance statistic.
+ */
+ private boolean varstat = false;
+
+ /**
* Constructor.
*
* @param distanceFunction distance function
* @param k k parameter
* @param maxiter Maxiter parameter
* @param initializer Initialization method
+ * @param varstat Compute the variance statistic
*/
- public KMeansHamerly(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ public KMeansHamerly(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer, boolean varstat) {
super(distanceFunction, k, maxiter, initializer);
+ this.varstat = varstat;
}
@Override
@@ -102,7 +113,7 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
}
// Choose initial means
if(LOG.isStatistics()) {
- LOG.statistics(new StringStatistic(KEY + ".initializer", initializer.toString()));
+ LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
}
List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
// Setup cluster assignment store
@@ -124,7 +135,7 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
double[] sep = new double[k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
- LongStatistic varstat = LOG.isStatistics() ? new LongStatistic(KEY + ".reassignments") : null;
+ LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(KEY + ".reassignments") : null;
int iteration = 0;
for(; maxiter <= 0 || iteration < maxiter; iteration++) {
LOG.incrementProcessed(prog);
@@ -136,9 +147,9 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
recomputeSeperation(means, sep);
changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, upper, lower);
}
- if(varstat != null) {
- varstat.setLong(changed);
- LOG.statistics(varstat);
+ if(rstat != null) {
+ rstat.setLong(changed);
+ LOG.statistics(rstat);
}
// Stop if no cluster assignment changed.
if(changed == 0) {
@@ -167,6 +178,7 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
lower.destroy();
// Wrap result
+ double totalvariance = 0.;
Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
for(int i = 0; i < clusters.size(); i++) {
DBIDs ids = clusters.get(i);
@@ -178,9 +190,13 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
varsum += distanceFunction.distance(mean, relation.get(it));
}
+ totalvariance += varsum;
KMeansModel model = new KMeansModel(mean, varsum);
result.addToplevelCluster(new Cluster<>(ids, model));
}
+ if(LOG.isStatistics() && varstat) {
+ LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance));
+ }
return result;
}
@@ -389,6 +405,16 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
* @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ /**
+ * Flag to compute the final clustering variance statistic.
+ */
+ public static final OptionID VARSTAT_ID = new OptionID("kmeans.varstat", "Compute the final clustering variance statistic. Needs an additional full pass over the data set.");
+
+ /**
+ * Compute the final variance statisic.
+ */
+ protected boolean varstat = false;
+
@Override
protected Logging getLogger() {
return LOG;
@@ -406,8 +432,17 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe
}
@Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ Flag varF = new Flag(VARSTAT_ID);
+ if(config.grab(varF)) {
+ varstat = varF.isTrue();
+ }
+ }
+
+ @Override
protected KMeansHamerly<V> makeInstance() {
- return new KMeansHamerly<>(distanceFunction, k, maxiter, initializer);
+ return new KMeansHamerly<>(distanceFunction, k, maxiter, initializer, varstat);
}
}
}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
index cd025b03..476f95f2 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
@@ -52,6 +52,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
* processing and Lloyd-Style batch steps.
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @apiviz.has KMeansModel
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index d8b69f54..c32f22c1 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -46,6 +46,7 @@ import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -53,15 +54,16 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
/**
* The standard k-means algorithm, using Lloyd-style bulk iterations.
*
+ * Reference:
* <p>
- * Reference:<br />
- * S. Lloyd<br/>
+ * S. Lloyd:<br/>
* Least squares quantization in PCM<br/>
* IEEE Transactions on Information Theory 28 (2)<br/>
* previously published as Bell Telephone Laboratories Paper
* </p>
*
* @author Arthur Zimek
+ * @since 0.5.0
*
* @apiviz.landmark
* @apiviz.has KMeansModel
@@ -74,6 +76,8 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
title = "Least squares quantization in PCM", //
booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", //
url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
+@Alias({ "de.lmu.ifi.dbs.elki.algorithm.clustering.KMeans", //
+"de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans" })
public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
/**
* The logger for this class.
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index 176fa311..4f22497d 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -58,14 +58,16 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
* convergence, although MacQueen likely only meant to do a single pass over the
* data.
*
+ * Reference:
* <p>
- * Reference:<br />
- * J. MacQueen: Some Methods for Classification and Analysis of Multivariate
- * Observations. <br />
+ * J. MacQueen:<br />
+ * Some Methods for Classification and Analysis of Multivariate Observations.
+ * <br />
* In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297.
* </p>
*
* @author Erich Schubert
+ * @since 0.5.0
* @apiviz.has KMeansModel
*
* @param <V> vector type to use
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java
new file mode 100644
index 00000000..c119c4ec
--- /dev/null
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java
@@ -0,0 +1,284 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2015
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.DoubleIntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+
+/**
+ * Sort-Means: Accelerated k-means by exploiting the triangle inequality and
+ * pairwise distances of means to prune candidate means (with sorting).
+ *
+ * Reference:
+ * <p>
+ * S. J. Phillips<br />
+ * Acceleration of k-means and related clustering algorithms<br />
+ * Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX
+ * 2002)
+ * </p>
+ *
+ * @author Erich Schubert
+ * @since 0.7.1
+ *
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ */
+@Title("Sort-Means")
+@Reference(authors = "S. J. Phillips", //
+title = "Acceleration of k-means and related clustering algorithms", //
+booktitle = "Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX 2002)", //
+url = "http://dx.doi.org/10.1007/3-540-45643-0_13")
+public class KMeansSort<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansSort.class);
+
+ /**
+ * Key for statistics logging.
+ */
+ private static final String KEY = KMeansSort.class.getName();
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ */
+ public KMeansSort(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ @Override
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ if(LOG.isStatistics()) {
+ LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
+ }
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
+
+ // Cluster distances
+ double[][] cdist = new double[k][k];
+ int[][] cnum = new int[k][k - 1];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
+ LongStatistic diststat = LOG.isStatistics() ? new LongStatistic(KEY + ".distance-computations") : null;
+ int iteration = 0;
+ for(; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ recomputeSeperation(means, cdist, cnum, diststat);
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, cdist, cnum, diststat);
+ logVarstat(varstat, varsum);
+ if(LOG.isStatistics()) {
+ LOG.statistics(diststat);
+ }
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute means.
+ means = means(clusters, means, relation);
+ }
+ LOG.setCompleted(prog);
+ if(LOG.isStatistics()) {
+ LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
+ }
+
+ // Wrap result
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ if(ids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(ids, model));
+ }
+ return result;
+ }
+
+ /**
+ * Recompute the separation of cluster means.
+ *
+ * @param means Means
+ * @param cdist Center-to-Center distances
+ * @param cnum Center numbers
+ * @param diststat Distance counting statistic
+ */
+ private void recomputeSeperation(List<Vector> means, double[][] cdist, int[][] cnum, LongStatistic diststat) {
+ final int k = means.size();
+ for(int i = 1; i < k; i++) {
+ Vector mi = means.get(i);
+ for(int j = 0; j < i; j++) {
+ double d = distanceFunction.distance(mi, means.get(j));
+ cdist[i][j] = d;
+ cdist[j][i] = d;
+ }
+ }
+ double[] buf = new double[k - 1];
+ for(int i = 0; i < k; i++) {
+ System.arraycopy(cdist[i], 0, buf, 0, i);
+ System.arraycopy(cdist[i], i + 1, buf, i, k - i - 1);
+ for(int j = 0; j < buf.length; j++) {
+ cnum[i][j] = j < i ? j : (j + 1);
+ }
+ DoubleIntegerArrayQuickSort.sort(buf, cnum[i], k - 1);
+ }
+ if(diststat != null) {
+ diststat.increment((k * (k - 1)) >> 1);
+ }
+ }
+
+ /**
+ * Reassign objects, but only if their bounds indicate it is necessary to do
+ * so.
+ *
+ * @param relation Data
+ * @param means Current means
+ * @param clusters Current clusters
+ * @param assignment Cluster assignment
+ * @param varsum Variance sum counter
+ * @param cdist Centroid distances
+ * @param cnum Centroid nearest neighbors
+ * @param diststat Distance statistics
+ * @return true when the object was reassigned
+ */
+ private boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum, double[][] cdist, int[][] cnum, LongStatistic diststat) {
+ assert (k == means.size());
+ long dists = 0;
+ boolean changed = false;
+ // Reset all clusters
+ Arrays.fill(varsum, 0.);
+ for(ModifiableDBIDs cluster : clusters) {
+ cluster.clear();
+ }
+ final NumberVectorDistanceFunction<?> df = getDistanceFunction();
+ double mult = (df instanceof SquaredEuclideanDistanceFunction) ? 4 : 2;
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ final int cur = assignment.intValue(iditer), ini = cur >= 0 ? cur : 0;
+ // Distance to current mean:
+ V fv = relation.get(iditer);
+ double mindist = df.distance(fv, means.get(ini));
+ ++dists;
+ final double threshold = mult * mindist;
+ int minIndex = ini;
+ for(int i : cnum[ini]) {
+ if(cdist[minIndex][i] >= threshold) { // Sort pruning
+ break; // All following can only be worse.
+ }
+ double dist = df.distance(fv, means.get(i));
+ ++dists;
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ varsum[minIndex] += mindist;
+ clusters.get(minIndex).add(iditer);
+ changed |= assignment.putInt(iditer, minIndex) != minIndex;
+ }
+ // Increment distance computations counter.
+ if(diststat != null) {
+ diststat.increment(dists);
+ }
+ return changed;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected void getParameterDistanceFunction(Parameterization config) {
+ super.getParameterDistanceFunction(config);
+ if(distanceFunction instanceof SquaredEuclideanDistanceFunction) {
+ return; // Proper choice.
+ }
+ if(distanceFunction != null && !distanceFunction.isMetric()) {
+ LOG.warning("Compare k-means requires a metric distance, and k-means should only be used with squared Euclidean distance!");
+ }
+ }
+
+ @Override
+ protected KMeansSort<V> makeInstance() {
+ return new KMeansSort<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
index 6d8b5874..7e4b4524 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -60,6 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
* </p>
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @apiviz.has MeanModel
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
index 567d6753..cc06a4d5 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -72,6 +72,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* (this variation is likely not worth publishing on its own).
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @apiviz.has MedoidModel
* @apiviz.composedOf KMedoidsInitialization
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
index fc3bdd5f..4d45eb3d 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -77,6 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* </p>
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @apiviz.has MedoidModel
* @apiviz.composedOf KMedoidsInitialization
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java
index 5fa88a33..b4413904 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java
@@ -48,6 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
* Pseudo-k-Means variations, that assigns each object to the nearest center.
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @apiviz.has KMeansModel
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
index d354327b..75a87daa 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
@@ -77,6 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @author Tibor Goldschwendt
* @author Erich Schubert
+ * @since 0.7.0
*
* @param <V> Vector type
* @param <M> Model type
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java
index 723c86cc..72f9a887 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java
@@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* Abstract base class for common k-means initializations.
*
* @author Erich Schubert
+ * @since 0.3
*
* @param <V> Vector type
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java
index 4fa0d004..a686f345 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java
@@ -40,6 +40,7 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
@@ -52,9 +53,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
* times will be more likely to return the same local minima.
*
* @author Erich Schubert
+ * @since 0.6.0
*
* @param <O> Object type for kMedoids and kMedians
*/
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.FarthestPointsInitialMeans")
public class FarthestPointsInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
/**
* Discard the first vector.
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java
index 9cceb4f2..9f223dac 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java
@@ -49,6 +49,7 @@ import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
* times will be more likely to return the same local minima.
*
* @author Erich Schubert
+ * @since 0.6.0
*
* @param <O> Object type for kmedoids and kmedians
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java
index beab423e..210994e5 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java
@@ -34,15 +34,18 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
* Initialize K-means by using the first k objects as initial means.
*
* @author Erich Schubert
+ * @since 0.4.0
*
* @param <O> Object type for KMedoids
*/
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.FirstKInitialMeans")
public class FirstKInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> {
/**
* Constructor.
@@ -84,4 +87,4 @@ public class FirstKInitialMeans<O> implements KMeansInitialization<NumberVector>
return new FirstKInitialMeans<>();
}
}
-} \ No newline at end of file
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java
index 55ffb65e..c1cd71a0 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java
@@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
* Interface for initializing K-Means
*
* @author Erich Schubert
+ * @since 0.2
*
* @apiviz.landmark
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java
index c1518d18..11bfef59 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java
@@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -56,6 +57,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
* </p>
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @param <O> Vector type
*/
@@ -63,6 +65,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
title = "k-means++: the advantages of careful seeding", //
booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", //
url = "http://dx.doi.org/10.1145/1283383.1283494")
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansPlusPlusInitialMeans")
public class KMeansPlusPlusInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
/**
* Constructor.
@@ -232,4 +235,4 @@ public class KMeansPlusPlusInitialMeans<O> extends AbstractKMeansInitialization<
return new KMeansPlusPlusInitialMeans<>(rnd);
}
}
-} \ No newline at end of file
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java
index 0ae3723e..2dc11c52 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java
@@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
* this initialization will only return members of the original data set.
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @param <V> Object type
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java
index 396d79e6..ea11330e 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java
@@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -57,12 +58,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
* </p>
*
* @author Erich Schubert
+ * @since 0.5.0
*
* @param <O> Object type for KMedoids initialization
*/
@Reference(title = "Clustering my means of Medoids", //
authors = "Kaufman, L. and Rousseeuw, P.J.", //
booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.PAMInitialMeans")
public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> {
/**
* Class logger.
@@ -78,6 +81,9 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K
@Override
public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, NumberVectorDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ if(relation.size() < k) {
+ throw new AbortException("Database has less than k objects.");
+ }
// Ugly cast; but better than code duplication.
@SuppressWarnings("unchecked")
Relation<O> rel = (Relation<O>) relation;
@@ -97,38 +103,33 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K
public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
ArrayModifiableDBIDs medids = DBIDUtil.newArray(k);
DBIDVar bestid = DBIDUtil.newVar();
- WritableDoubleDataStore mindist = null;
+ // We need three temporary storage arrays:
+ WritableDoubleDataStore mindist, bestd, tempd;
+ mindist = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ bestd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ tempd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
// First mean is chosen by having the smallest distance sum to all others.
{
double best = Double.POSITIVE_INFINITY;
- WritableDoubleDataStore newd = null;
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial mean", ids.size(), LOG) : null;
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- if(newd == null) {
- newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
- }
- int sum = 0;
+ double sum = 0, d;
for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
- double d = distQ.distance(iter, iter2);
- sum += d;
- newd.putDouble(iter2, d);
+ sum += d = distQ.distance(iter, iter2);
+ tempd.putDouble(iter2, d);
}
if(sum < best) {
best = sum;
bestid.set(iter);
- if(mindist != null) {
- mindist.destroy();
- }
- mindist = newd;
- newd = null;
+ // Swap mindist and newd:
+ WritableDoubleDataStore temp = mindist;
+ mindist = tempd;
+ tempd = temp;
}
LOG.incrementProcessed(prog);
}
LOG.ensureCompleted(prog);
- if(newd != null) {
- newd.destroy();
- }
medids.add(bestid);
}
assert(mindist != null);
@@ -138,44 +139,40 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K
LOG.incrementProcessed(prog); // First one was just chosen.
for(int i = 1; i < k; i++) {
double best = Double.POSITIVE_INFINITY;
- WritableDoubleDataStore bestd = null, newd = null;
+ bestid.unset();
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
if(medids.contains(iter)) {
continue;
}
- if(newd == null) {
- newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
- }
- double sum = 0.;
+ double sum = 0., v;
for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
- double v = MathUtil.min(distQ.distance(iter, iter2), mindist.doubleValue(iter2));
- sum += v;
- newd.put(iter2, v);
+ sum += v = MathUtil.min(distQ.distance(iter, iter2), mindist.doubleValue(iter2));
+ tempd.put(iter2, v);
}
if(sum < best) {
best = sum;
bestid.set(iter);
- if(bestd != null) {
- bestd.destroy();
- }
- bestd = newd;
- newd = null;
+ // Swap bestd and newd:
+ WritableDoubleDataStore temp = bestd;
+ bestd = tempd;
+ tempd = temp;
}
}
- if(bestd == null) {
+ if(!bestid.isSet()) {
throw new AbortException("No median found that improves the criterion function?!? Too many infinite distances.");
}
medids.add(bestid);
- if(newd != null) {
- newd.destroy();
- }
- mindist.destroy();
- mindist = bestd;
+ // Swap bestd and mindist:
+ WritableDoubleDataStore temp = bestd;
+ bestd = mindist;
+ mindist = temp;
LOG.incrementProcessed(prog);
}
LOG.ensureCompleted(prog);
mindist.destroy();
+ bestd.destroy();
+ tempd.destroy();
return medids;
}
@@ -192,4 +189,4 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K
return new PAMInitialMeans<>();
}
}
-} \ No newline at end of file
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java
index ca8a612a..b01651b8 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java
@@ -38,12 +38,13 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.VectorListParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleArrayListParameter;
/**
* Run k-means with prespecified initial means.
*
* @author Erich Schubert
+ * @since 0.7.0
*/
public class PredefinedInitialMeans extends AbstractKMeansInitialization<NumberVector> {
/**
@@ -147,9 +148,12 @@ public class PredefinedInitialMeans extends AbstractKMeansInitialization<NumberV
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- VectorListParameter meansP = new VectorListParameter(INITIAL_MEANS);
+ DoubleArrayListParameter meansP = new DoubleArrayListParameter(INITIAL_MEANS);
if(config.grab(meansP)) {
- initialMeans = meansP.getValue();
+ initialMeans = new ArrayList<>(meansP.getValue().size());
+ for(double[] v : meansP.getValue()) {
+ initialMeans.add(new Vector(v));
+ }
}
}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java
index b6cfad8d..bb03b278 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java
@@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
/**
@@ -51,12 +52,14 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
* available in Biometrics).
*
* @author Erich Schubert
+ * @since 0.4.0
*
* @param <O> Vector type
*/
@Reference(authors = "E. W. Forgy", //
title = "Cluster analysis of multivariate data: efficiency versus interpretability of classifications", //
booktitle = "Biometrics 21(3)")
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyChosenInitialMeans")
public class RandomlyChosenInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
/**
* Constructor.
@@ -95,4 +98,4 @@ public class RandomlyChosenInitialMeans<O> extends AbstractKMeansInitialization<
return new RandomlyChosenInitialMeans<>(rnd);
}
}
-} \ No newline at end of file
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java
index bc2181e8..30d1b00f 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java
@@ -33,13 +33,16 @@ import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
* Initialize k-means by generating random vectors (within the data sets value
* range).
*
* @author Erich Schubert
+ * @since 0.5.0
*/
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyGeneratedInitialMeans")
public class RandomlyGeneratedInitialMeans extends AbstractKMeansInitialization<NumberVector> {
/**
* Constructor.
@@ -84,4 +87,4 @@ public class RandomlyGeneratedInitialMeans extends AbstractKMeansInitialization<
return new RandomlyGeneratedInitialMeans(rnd);
}
}
-} \ No newline at end of file
+}
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java
index ac7f7f27..f5ed00f1 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java
@@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
@@ -53,9 +54,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* Initialize k-means by running k-means on a sample of the data set only.
*
* @author Erich Schubert
+ * @since 0.6.0
*
* @param <V> Vector type
*/
+@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.SampleKMeansInitialization")
public class SampleKMeansInitialization<V extends NumberVector> extends AbstractKMeansInitialization<V> {
/**
* Variant of kMeans to use for initialization.
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java
index c0e34f12..f2b9db1f 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java
@@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.parallel.processor.Processor;
* Parallel k-means implementation.
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.has Instance
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java
index a10d45ae..e214581a 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java
@@ -51,6 +51,7 @@ import de.lmu.ifi.dbs.elki.parallel.ParallelExecutor;
* Parallel implementation of k-Means clustering.
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.has KMeansProcessor
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java
index 721b74b6..cc4c00ea 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java
@@ -65,6 +65,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
*
* @author Tibor Goldschwendt
* @author Erich Schubert
+ * @since 0.7.0
*/
@Reference(authors = "D. Pelleg, A. Moore", //
booktitle = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java
index e1e72b51..aafd6173 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java
@@ -51,6 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
*
* @author Tibor Goldschwendt
* @author Erich Schubert
+ * @since 0.2
*/
@Reference(authors = "H. Akaike", //
title = "On entropy maximization principle", //
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java
index 8b21753d..22ad56d9 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java
@@ -52,6 +52,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
*
* @author Tibor Goldschwendt
* @author Erich Schubert
+ * @since 0.6.0
*/
@Reference(authors = "G. Schwarz", //
title = "Estimating the dimension of a model", //
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java
index 755d7137..6a18269a 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java
@@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
*
* @author Tibor Goldschwendt
* @author Erich Schubert
+ * @since 0.2
*/
@Reference(authors = "Q. Zhao, M. Xu, P. Fränti", //
title = "Knee Point Detection on Bayesian Information Criterion", //
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
index bd0f12a1..100c2dfc 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
* Important note: some measures are ascending, others are descending!
*
* @author Erich Schubert
+ * @since 0.2
*
* @param <O> Input Object restriction type
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
index cd7a4d14..e7069ac0 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
@@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
* The average of all average pairwise distances in a cluster.
*
* @author Stephan Baier
+ * @since 0.6.0
*/
public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector> {
@Override
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
index 09017413..fbd04dbd 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.6.0
*/
public class WithinClusterVarianceQualityMeasure extends AbstractKMeansQualityMeasure<NumberVector> {
@Override