diff options
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
39 files changed, 739 insertions, 64 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index 40fb313d..5df1bee8 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -63,6 +63,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * Abstract base class for k-means implementations. * * @author Erich Schubert + * @since 0.5.0 * * @apiviz.composedOf KMeansInitialization * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java index 292e74db..84fb877c 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java @@ -49,6 +49,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @author Stephan Baier * @author Erich Schubert + * @since 0.6.0 * * @param <V> Vector type * @param <M> Model type diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java index 4e2bf7a8..42ea12b6 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java @@ -66,6 +66,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * </p> * * @author Erich Schubert + * @since 0.7.0 * * @param <V> Vector type */ @@ -124,7 +125,7 @@ public class CLARA<V> extends KMedoidsPAM<V> { WritableIntegerDataStore bestclusters = null; Random rnd = random.getSingleThreadedRandom(); - FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Random samples.", numsamples, LOG) : null; + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing random samples", numsamples, LOG) : null; for(int j = 0; j < numsamples; j++) { DBIDs rids = DBIDUtil.randomSample(ids, sampling, rnd); // Choose initial medoids diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java index b242b407..697655d8 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; * Some constants and options shared among kmeans family algorithms. * * @author Erich Schubert + * @since 0.2 * * @param <V> Number vector type * @param <M> Actual model type diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java index c6082176..8885fc46 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java @@ -69,6 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * blocks. * * @author Erich Schubert + * @since 0.6.0 * * @apiviz.has KMeansModel * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java index 70b6a30e..92e00e0d 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java @@ -61,6 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * </p> * * @author Stephan Baier + * @since 0.6.0 * * @param <V> Vector type * @param <M> Model type diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java new file mode 100644 index 00000000..5632621a --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansCompare.java @@ -0,0 +1,271 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; + +/** + * Compare-Means: Accelerated k-means by exploiting the triangle inequality and + * pairwise distances of means to prune candidate means. + * + * Reference: + * <p> + * S. J. Phillips<br /> + * Acceleration of k-means and related clustering algorithms<br /> + * Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX + * 2002) + * </p> + * + * @author Erich Schubert + * @since 0.5.0 + * + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + */ +@Title("Compare-Means") +@Reference(authors = "S. J. Phillips", // +title = "Acceleration of k-means and related clustering algorithms", // +booktitle = "Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX 2002)", // +url = "http://dx.doi.org/10.1007/3-540-45643-0_13") +public class KMeansCompare<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansCompare.class); + + /** + * Key for statistics logging. + */ + private static final String KEY = KMeansCompare.class.getName(); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Initialization method + */ + public KMeansCompare(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + @Override + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + if(LOG.isStatistics()) { + LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); + } + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + double[] varsum = new double[k]; + + // Cluster distances + double[][] cdist = new double[k][k]; + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; + DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; + LongStatistic diststat = LOG.isStatistics() ? new LongStatistic(KEY + ".distance-computations") : null; + int iteration = 0; + for(; maxiter <= 0 || iteration < maxiter; iteration++) { + LOG.incrementProcessed(prog); + recomputeSeperation(means, cdist, diststat); + boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, cdist, diststat); + logVarstat(varstat, varsum); + if(LOG.isStatistics()) { + LOG.statistics(diststat); + } + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute means. + means = means(clusters, means, relation); + } + LOG.setCompleted(prog); + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); + } + + // Wrap result + Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + if(ids.size() == 0) { + continue; + } + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); + result.addToplevelCluster(new Cluster<>(ids, model)); + } + return result; + } + + /** + * Recompute the separation of cluster means. + * + * @param means Means + * @param cdist Center-to-Center distances + * @param diststat Distance counting statistic + */ + private void recomputeSeperation(List<Vector> means, double[][] cdist, LongStatistic diststat) { + final int k = means.size(); + for(int i = 1; i < k; i++) { + Vector mi = means.get(i); + for(int j = 0; j < i; j++) { + double d = distanceFunction.distance(mi, means.get(j)); + cdist[i][j] = d; + cdist[j][i] = d; + } + } + if(diststat != null) { + diststat.increment((k * (k - 1)) >> 1); + } + } + + /** + * Reassign objects, but only if their bounds indicate it is necessary to do + * so. + * + * @param relation Data + * @param means Current means + * @param clusters Current clusters + * @param assignment Cluster assignment + * @param varsum Variance sum counter + * @param cdist Centroid distances + * @param diststat Distance statistics + * @return true when the object was reassigned + */ + private boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum, double[][] cdist, LongStatistic diststat) { + assert (k == means.size()); + long dists = 0; + boolean changed = false; + // Reset all clusters + Arrays.fill(varsum, 0.); + for(ModifiableDBIDs cluster : clusters) { + cluster.clear(); + } + final NumberVectorDistanceFunction<?> df = getDistanceFunction(); + double mult = (df instanceof SquaredEuclideanDistanceFunction) ? 4 : 2; + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final int cur = assignment.intValue(iditer), ini = cur >= 0 ? cur : 0; + // Distance to current mean: + V fv = relation.get(iditer); + double mindist = df.distance(fv, means.get(ini)); + ++dists; + final double thresh = mult * mindist; + int minIndex = ini; + for(int i = 0; i < k; i++) { + if(i == ini || cdist[minIndex][i] >= thresh) { // Compare pruning + continue; + } + double dist = df.distance(fv, means.get(i)); + ++dists; + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + varsum[minIndex] += mindist; + clusters.get(minIndex).add(iditer); + changed |= assignment.putInt(iditer, minIndex) != minIndex; + } + // Increment distance computations counter. + if(diststat != null) { + diststat.increment(dists); + } + return changed; + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected void getParameterDistanceFunction(Parameterization config) { + super.getParameterDistanceFunction(config); + if(distanceFunction instanceof SquaredEuclideanDistanceFunction) { + return; // Proper choice. + } + if(distanceFunction != null && !distanceFunction.isMetric()) { + LOG.warning("Compare k-means requires a metric distance, and k-means should only be used with squared Euclidean distance!"); + } + } + + @Override + protected KMeansCompare<V> makeInstance() { + return new KMeansCompare<>(distanceFunction, k, maxiter, initializer); + } + } +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java index e2dcf88c..58015327 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansElkan.java @@ -47,11 +47,14 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; /** * Elkan's fast k-means by exploiting the triangle inequality. @@ -69,6 +72,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz * </p> * * @author Erich Schubert + * @since 0.7.0 * * @apiviz.has KMeansModel * @@ -90,15 +94,22 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean private static final String KEY = KMeansElkan.class.getName(); /** + * Flag whether to compute the final variance statistic. + */ + private boolean varstat = false; + + /** * Constructor. * * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter * @param initializer Initialization method + * @param varstat Compute the variance statistic */ - public KMeansElkan(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { + public KMeansElkan(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer, boolean varstat) { super(distanceFunction, k, maxiter, initializer); + this.varstat = varstat; } @Override @@ -108,7 +119,7 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean } // Choose initial means if(LOG.isStatistics()) { - LOG.statistics(new StringStatistic(KEY + ".initializer", initializer.toString())); + LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); // Setup cluster assignment store @@ -136,7 +147,7 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean double[][] cdist = new double[k][k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; - LongStatistic varstat = LOG.isStatistics() ? new LongStatistic(this.getClass().getName() + ".reassignments") : null; + LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(this.getClass().getName() + ".reassignments") : null; int iteration = 0; for(; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); @@ -148,9 +159,9 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean recomputeSeperation(means, sep, cdist); // #1 changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, cdist, upper, lower); } - if(varstat != null) { - varstat.setLong(changed); - LOG.statistics(varstat); + if(rstat != null) { + rstat.setLong(changed); + LOG.statistics(rstat); } // Stop if no cluster assignment changed. if(changed == 0) { @@ -178,6 +189,7 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean lower.destroy(); // Wrap result + double totalvariance = 0.; Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for(int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); @@ -189,9 +201,13 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { varsum += distanceFunction.distance(mean, relation.get(it)); } + totalvariance += varsum; KMeansModel model = new KMeansModel(mean, varsum); result.addToplevelCluster(new Cluster<>(ids, model)); } + if(LOG.isStatistics() && varstat) { + LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance)); + } return result; } @@ -395,6 +411,16 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { + /** + * Flag to compute the final clustering variance statistic. + */ + public static final OptionID VARSTAT_ID = new OptionID("kmeans.varstat", "Compute the final clustering variance statistic. Needs an additional full pass over the data set."); + + /** + * Compute the final variance statisic. + */ + protected boolean varstat = false; + @Override protected Logging getLogger() { return LOG; @@ -412,8 +438,17 @@ public class KMeansElkan<V extends NumberVector> extends AbstractKMeans<V, KMean } @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + Flag varF = new Flag(VARSTAT_ID); + if(config.grab(varF)) { + varstat = varF.isTrue(); + } + } + + @Override protected KMeansElkan<V> makeInstance() { - return new KMeansElkan<>(distanceFunction, k, maxiter, initializer); + return new KMeansElkan<>(distanceFunction, k, maxiter, initializer, varstat); } } } diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java index 1b114701..06228525 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHamerly.java @@ -46,11 +46,14 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; /** * Hamerly's fast k-means by exploiting the triangle inequality. @@ -63,6 +66,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz * </p> * * @author Erich Schubert + * @since 0.7.0 * * @apiviz.has KMeansModel * @@ -84,15 +88,22 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe private static final String KEY = KMeansHamerly.class.getName(); /** + * Flag whether to compute the final variance statistic. + */ + private boolean varstat = false; + + /** * Constructor. * * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter * @param initializer Initialization method + * @param varstat Compute the variance statistic */ - public KMeansHamerly(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { + public KMeansHamerly(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer, boolean varstat) { super(distanceFunction, k, maxiter, initializer); + this.varstat = varstat; } @Override @@ -102,7 +113,7 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe } // Choose initial means if(LOG.isStatistics()) { - LOG.statistics(new StringStatistic(KEY + ".initializer", initializer.toString())); + LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); // Setup cluster assignment store @@ -124,7 +135,7 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe double[] sep = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; - LongStatistic varstat = LOG.isStatistics() ? new LongStatistic(KEY + ".reassignments") : null; + LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(KEY + ".reassignments") : null; int iteration = 0; for(; maxiter <= 0 || iteration < maxiter; iteration++) { LOG.incrementProcessed(prog); @@ -136,9 +147,9 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe recomputeSeperation(means, sep); changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, upper, lower); } - if(varstat != null) { - varstat.setLong(changed); - LOG.statistics(varstat); + if(rstat != null) { + rstat.setLong(changed); + LOG.statistics(rstat); } // Stop if no cluster assignment changed. if(changed == 0) { @@ -167,6 +178,7 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe lower.destroy(); // Wrap result + double totalvariance = 0.; Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for(int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); @@ -178,9 +190,13 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { varsum += distanceFunction.distance(mean, relation.get(it)); } + totalvariance += varsum; KMeansModel model = new KMeansModel(mean, varsum); result.addToplevelCluster(new Cluster<>(ids, model)); } + if(LOG.isStatistics() && varstat) { + LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance)); + } return result; } @@ -389,6 +405,16 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { + /** + * Flag to compute the final clustering variance statistic. + */ + public static final OptionID VARSTAT_ID = new OptionID("kmeans.varstat", "Compute the final clustering variance statistic. Needs an additional full pass over the data set."); + + /** + * Compute the final variance statisic. + */ + protected boolean varstat = false; + @Override protected Logging getLogger() { return LOG; @@ -406,8 +432,17 @@ public class KMeansHamerly<V extends NumberVector> extends AbstractKMeans<V, KMe } @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + Flag varF = new Flag(VARSTAT_ID); + if(config.grab(varF)) { + varstat = varF.isTrue(); + } + } + + @Override protected KMeansHamerly<V> makeInstance() { - return new KMeansHamerly<>(distanceFunction, k, maxiter, initializer); + return new KMeansHamerly<>(distanceFunction, k, maxiter, initializer, varstat); } } } diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java index cd025b03..476f95f2 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java @@ -52,6 +52,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; * processing and Lloyd-Style batch steps. * * @author Erich Schubert + * @since 0.5.0 * * @apiviz.has KMeansModel * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index d8b69f54..c32f22c1 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -46,6 +46,7 @@ import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -53,15 +54,16 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; /** * The standard k-means algorithm, using Lloyd-style bulk iterations. * + * Reference: * <p> - * Reference:<br /> - * S. Lloyd<br/> + * S. Lloyd:<br/> * Least squares quantization in PCM<br/> * IEEE Transactions on Information Theory 28 (2)<br/> * previously published as Bell Telephone Laboratories Paper * </p> * * @author Arthur Zimek + * @since 0.5.0 * * @apiviz.landmark * @apiviz.has KMeansModel @@ -74,6 +76,8 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; title = "Least squares quantization in PCM", // booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", // url = "http://dx.doi.org/10.1109/TIT.1982.1056489") +@Alias({ "de.lmu.ifi.dbs.elki.algorithm.clustering.KMeans", // +"de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans" }) public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { /** * The logger for this class. diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index 176fa311..4f22497d 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -58,14 +58,16 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * convergence, although MacQueen likely only meant to do a single pass over the * data. * + * Reference: * <p> - * Reference:<br /> - * J. MacQueen: Some Methods for Classification and Analysis of Multivariate - * Observations. <br /> + * J. MacQueen:<br /> + * Some Methods for Classification and Analysis of Multivariate Observations. + * <br /> * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297. * </p> * * @author Erich Schubert + * @since 0.5.0 * @apiviz.has KMeansModel * * @param <V> vector type to use diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java new file mode 100644 index 00000000..c119c4ec --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansSort.java @@ -0,0 +1,284 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.DoubleIntegerArrayQuickSort; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; + +/** + * Sort-Means: Accelerated k-means by exploiting the triangle inequality and + * pairwise distances of means to prune candidate means (with sorting). + * + * Reference: + * <p> + * S. J. Phillips<br /> + * Acceleration of k-means and related clustering algorithms<br /> + * Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX + * 2002) + * </p> + * + * @author Erich Schubert + * @since 0.7.1 + * + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + */ +@Title("Sort-Means") +@Reference(authors = "S. J. Phillips", // +title = "Acceleration of k-means and related clustering algorithms", // +booktitle = "Proc. 4th Int. Workshop on Algorithm Engineering and Experiments (ALENEX 2002)", // +url = "http://dx.doi.org/10.1007/3-540-45643-0_13") +public class KMeansSort<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansSort.class); + + /** + * Key for statistics logging. + */ + private static final String KEY = KMeansSort.class.getName(); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Initialization method + */ + public KMeansSort(NumberVectorDistanceFunction<? super V> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + @Override + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + if(LOG.isStatistics()) { + LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); + } + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + double[] varsum = new double[k]; + + // Cluster distances + double[][] cdist = new double[k][k]; + int[][] cnum = new int[k][k - 1]; + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; + DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; + LongStatistic diststat = LOG.isStatistics() ? new LongStatistic(KEY + ".distance-computations") : null; + int iteration = 0; + for(; maxiter <= 0 || iteration < maxiter; iteration++) { + LOG.incrementProcessed(prog); + recomputeSeperation(means, cdist, cnum, diststat); + boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, cdist, cnum, diststat); + logVarstat(varstat, varsum); + if(LOG.isStatistics()) { + LOG.statistics(diststat); + } + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute means. + means = means(clusters, means, relation); + } + LOG.setCompleted(prog); + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); + } + + // Wrap result + Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + if(ids.size() == 0) { + continue; + } + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); + result.addToplevelCluster(new Cluster<>(ids, model)); + } + return result; + } + + /** + * Recompute the separation of cluster means. + * + * @param means Means + * @param cdist Center-to-Center distances + * @param cnum Center numbers + * @param diststat Distance counting statistic + */ + private void recomputeSeperation(List<Vector> means, double[][] cdist, int[][] cnum, LongStatistic diststat) { + final int k = means.size(); + for(int i = 1; i < k; i++) { + Vector mi = means.get(i); + for(int j = 0; j < i; j++) { + double d = distanceFunction.distance(mi, means.get(j)); + cdist[i][j] = d; + cdist[j][i] = d; + } + } + double[] buf = new double[k - 1]; + for(int i = 0; i < k; i++) { + System.arraycopy(cdist[i], 0, buf, 0, i); + System.arraycopy(cdist[i], i + 1, buf, i, k - i - 1); + for(int j = 0; j < buf.length; j++) { + cnum[i][j] = j < i ? j : (j + 1); + } + DoubleIntegerArrayQuickSort.sort(buf, cnum[i], k - 1); + } + if(diststat != null) { + diststat.increment((k * (k - 1)) >> 1); + } + } + + /** + * Reassign objects, but only if their bounds indicate it is necessary to do + * so. + * + * @param relation Data + * @param means Current means + * @param clusters Current clusters + * @param assignment Cluster assignment + * @param varsum Variance sum counter + * @param cdist Centroid distances + * @param cnum Centroid nearest neighbors + * @param diststat Distance statistics + * @return true when the object was reassigned + */ + private boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum, double[][] cdist, int[][] cnum, LongStatistic diststat) { + assert (k == means.size()); + long dists = 0; + boolean changed = false; + // Reset all clusters + Arrays.fill(varsum, 0.); + for(ModifiableDBIDs cluster : clusters) { + cluster.clear(); + } + final NumberVectorDistanceFunction<?> df = getDistanceFunction(); + double mult = (df instanceof SquaredEuclideanDistanceFunction) ? 4 : 2; + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final int cur = assignment.intValue(iditer), ini = cur >= 0 ? cur : 0; + // Distance to current mean: + V fv = relation.get(iditer); + double mindist = df.distance(fv, means.get(ini)); + ++dists; + final double threshold = mult * mindist; + int minIndex = ini; + for(int i : cnum[ini]) { + if(cdist[minIndex][i] >= threshold) { // Sort pruning + break; // All following can only be worse. + } + double dist = df.distance(fv, means.get(i)); + ++dists; + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + varsum[minIndex] += mindist; + clusters.get(minIndex).add(iditer); + changed |= assignment.putInt(iditer, minIndex) != minIndex; + } + // Increment distance computations counter. + if(diststat != null) { + diststat.increment(dists); + } + return changed; + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected void getParameterDistanceFunction(Parameterization config) { + super.getParameterDistanceFunction(config); + if(distanceFunction instanceof SquaredEuclideanDistanceFunction) { + return; // Proper choice. + } + if(distanceFunction != null && !distanceFunction.isMetric()) { + LOG.warning("Compare k-means requires a metric distance, and k-means should only be used with squared Euclidean distance!"); + } + } + + @Override + protected KMeansSort<V> makeInstance() { + return new KMeansSort<>(distanceFunction, k, maxiter, initializer); + } + } +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index 6d8b5874..7e4b4524 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -60,6 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * </p> * * @author Erich Schubert + * @since 0.5.0 * * @apiviz.has MeanModel * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index 567d6753..cc06a4d5 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -72,6 +72,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * (this variation is likely not worth publishing on its own). * * @author Erich Schubert + * @since 0.5.0 * * @apiviz.has MedoidModel * @apiviz.composedOf KMedoidsInitialization diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index fc3bdd5f..4d45eb3d 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -77,6 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * </p> * * @author Erich Schubert + * @since 0.5.0 * * @apiviz.has MedoidModel * @apiviz.composedOf KMedoidsInitialization diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java index 5fa88a33..b4413904 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java @@ -48,6 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz * Pseudo-k-Means variations, that assigns each object to the nearest center. * * @author Erich Schubert + * @since 0.5.0 * * @apiviz.has KMeansModel * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java index d354327b..75a87daa 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java @@ -77,6 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * * @author Tibor Goldschwendt * @author Erich Schubert + * @since 0.7.0 * * @param <V> Vector type * @param <M> Model type diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java index 723c86cc..72f9a887 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java @@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * Abstract base class for common k-means initializations. * * @author Erich Schubert + * @since 0.3 * * @param <V> Vector type */ diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java index 4fa0d004..a686f345 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java @@ -40,6 +40,7 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -52,9 +53,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; * times will be more likely to return the same local minima. * * @author Erich Schubert + * @since 0.6.0 * * @param <O> Object type for kMedoids and kMedians */ +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.FarthestPointsInitialMeans") public class FarthestPointsInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> { /** * Discard the first vector. diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java index 9cceb4f2..9f223dac 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java @@ -49,6 +49,7 @@ import de.lmu.ifi.dbs.elki.math.random.RandomFactory; * times will be more likely to return the same local minima. * * @author Erich Schubert + * @since 0.6.0 * * @param <O> Object type for kmedoids and kmedians */ diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java index beab423e..210994e5 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java @@ -34,15 +34,18 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** * Initialize K-means by using the first k objects as initial means. * * @author Erich Schubert + * @since 0.4.0 * * @param <O> Object type for KMedoids */ +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.FirstKInitialMeans") public class FirstKInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> { /** * Constructor. @@ -84,4 +87,4 @@ public class FirstKInitialMeans<O> implements KMeansInitialization<NumberVector> return new FirstKInitialMeans<>(); } } -}
\ No newline at end of file +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java index 55ffb65e..c1cd71a0 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java @@ -33,6 +33,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio * Interface for initializing K-Means * * @author Erich Schubert + * @since 0.2 * * @apiviz.landmark * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java index c1518d18..11bfef59 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java @@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -56,6 +57,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; * </p> * * @author Erich Schubert + * @since 0.5.0 * * @param <O> Vector type */ @@ -63,6 +65,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; title = "k-means++: the advantages of careful seeding", // booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", // url = "http://dx.doi.org/10.1145/1283383.1283494") +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansPlusPlusInitialMeans") public class KMeansPlusPlusInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> { /** * Constructor. @@ -232,4 +235,4 @@ public class KMeansPlusPlusInitialMeans<O> extends AbstractKMeansInitialization< return new KMeansPlusPlusInitialMeans<>(rnd); } } -}
\ No newline at end of file +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java index 0ae3723e..2dc11c52 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; * this initialization will only return members of the original data set. * * @author Erich Schubert + * @since 0.5.0 * * @param <V> Object type */ diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java index 396d79e6..ea11330e 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; @@ -57,12 +58,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; * </p> * * @author Erich Schubert + * @since 0.5.0 * * @param <O> Object type for KMedoids initialization */ @Reference(title = "Clustering my means of Medoids", // authors = "Kaufman, L. and Rousseeuw, P.J.", // booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.PAMInitialMeans") public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> { /** * Class logger. @@ -78,6 +81,9 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K @Override public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, NumberVectorDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) { + if(relation.size() < k) { + throw new AbortException("Database has less than k objects."); + } // Ugly cast; but better than code duplication. @SuppressWarnings("unchecked") Relation<O> rel = (Relation<O>) relation; @@ -97,38 +103,33 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) { ArrayModifiableDBIDs medids = DBIDUtil.newArray(k); DBIDVar bestid = DBIDUtil.newVar(); - WritableDoubleDataStore mindist = null; + // We need three temporary storage arrays: + WritableDoubleDataStore mindist, bestd, tempd; + mindist = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + bestd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + tempd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); // First mean is chosen by having the smallest distance sum to all others. { double best = Double.POSITIVE_INFINITY; - WritableDoubleDataStore newd = null; FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial mean", ids.size(), LOG) : null; for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - if(newd == null) { - newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - } - int sum = 0; + double sum = 0, d; for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - double d = distQ.distance(iter, iter2); - sum += d; - newd.putDouble(iter2, d); + sum += d = distQ.distance(iter, iter2); + tempd.putDouble(iter2, d); } if(sum < best) { best = sum; bestid.set(iter); - if(mindist != null) { - mindist.destroy(); - } - mindist = newd; - newd = null; + // Swap mindist and newd: + WritableDoubleDataStore temp = mindist; + mindist = tempd; + tempd = temp; } LOG.incrementProcessed(prog); } LOG.ensureCompleted(prog); - if(newd != null) { - newd.destroy(); - } medids.add(bestid); } assert(mindist != null); @@ -138,44 +139,40 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K LOG.incrementProcessed(prog); // First one was just chosen. for(int i = 1; i < k; i++) { double best = Double.POSITIVE_INFINITY; - WritableDoubleDataStore bestd = null, newd = null; + bestid.unset(); for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { if(medids.contains(iter)) { continue; } - if(newd == null) { - newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - } - double sum = 0.; + double sum = 0., v; for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - double v = MathUtil.min(distQ.distance(iter, iter2), mindist.doubleValue(iter2)); - sum += v; - newd.put(iter2, v); + sum += v = MathUtil.min(distQ.distance(iter, iter2), mindist.doubleValue(iter2)); + tempd.put(iter2, v); } if(sum < best) { best = sum; bestid.set(iter); - if(bestd != null) { - bestd.destroy(); - } - bestd = newd; - newd = null; + // Swap bestd and newd: + WritableDoubleDataStore temp = bestd; + bestd = tempd; + tempd = temp; } } - if(bestd == null) { + if(!bestid.isSet()) { throw new AbortException("No median found that improves the criterion function?!? Too many infinite distances."); } medids.add(bestid); - if(newd != null) { - newd.destroy(); - } - mindist.destroy(); - mindist = bestd; + // Swap bestd and mindist: + WritableDoubleDataStore temp = bestd; + bestd = mindist; + mindist = temp; LOG.incrementProcessed(prog); } LOG.ensureCompleted(prog); mindist.destroy(); + bestd.destroy(); + tempd.destroy(); return medids; } @@ -192,4 +189,4 @@ public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, K return new PAMInitialMeans<>(); } } -}
\ No newline at end of file +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java index ca8a612a..b01651b8 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java @@ -38,12 +38,13 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.VectorListParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleArrayListParameter; /** * Run k-means with prespecified initial means. * * @author Erich Schubert + * @since 0.7.0 */ public class PredefinedInitialMeans extends AbstractKMeansInitialization<NumberVector> { /** @@ -147,9 +148,12 @@ public class PredefinedInitialMeans extends AbstractKMeansInitialization<NumberV @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - VectorListParameter meansP = new VectorListParameter(INITIAL_MEANS); + DoubleArrayListParameter meansP = new DoubleArrayListParameter(INITIAL_MEANS); if(config.grab(meansP)) { - initialMeans = meansP.getValue(); + initialMeans = new ArrayList<>(meansP.getValue().size()); + for(double[] v : meansP.getValue()) { + initialMeans.add(new Vector(v)); + } } } diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java index b6cfad8d..bb03b278 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; /** @@ -51,12 +52,14 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * available in Biometrics). * * @author Erich Schubert + * @since 0.4.0 * * @param <O> Vector type */ @Reference(authors = "E. W. Forgy", // title = "Cluster analysis of multivariate data: efficiency versus interpretability of classifications", // booktitle = "Biometrics 21(3)") +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyChosenInitialMeans") public class RandomlyChosenInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> { /** * Constructor. @@ -95,4 +98,4 @@ public class RandomlyChosenInitialMeans<O> extends AbstractKMeansInitialization< return new RandomlyChosenInitialMeans<>(rnd); } } -}
\ No newline at end of file +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java index bc2181e8..30d1b00f 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java @@ -33,13 +33,16 @@ import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.Alias; /** * Initialize k-means by generating random vectors (within the data sets value * range). * * @author Erich Schubert + * @since 0.5.0 */ +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyGeneratedInitialMeans") public class RandomlyGeneratedInitialMeans extends AbstractKMeansInitialization<NumberVector> { /** * Constructor. @@ -84,4 +87,4 @@ public class RandomlyGeneratedInitialMeans extends AbstractKMeansInitialization< return new RandomlyGeneratedInitialMeans(rnd); } } -}
\ No newline at end of file +} diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java index ac7f7f27..f5ed00f1 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; @@ -53,9 +54,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * Initialize k-means by running k-means on a sample of the data set only. * * @author Erich Schubert + * @since 0.6.0 * * @param <V> Vector type */ +@Alias("de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.SampleKMeansInitialization") public class SampleKMeansInitialization<V extends NumberVector> extends AbstractKMeansInitialization<V> { /** * Variant of kMeans to use for initialization. diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java index c0e34f12..f2b9db1f 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java @@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.parallel.processor.Processor; * Parallel k-means implementation. * * @author Erich Schubert + * @since 0.7.0 * * @apiviz.has Instance */ diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java index a10d45ae..e214581a 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java @@ -51,6 +51,7 @@ import de.lmu.ifi.dbs.elki.parallel.ParallelExecutor; * Parallel implementation of k-Means clustering. * * @author Erich Schubert + * @since 0.7.0 * * @apiviz.has KMeansProcessor * diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java index 721b74b6..cc4c00ea 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java @@ -65,6 +65,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Tibor Goldschwendt * @author Erich Schubert + * @since 0.7.0 */ @Reference(authors = "D. Pelleg, A. Moore", // booktitle = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", // diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java index e1e72b51..aafd6173 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java @@ -51,6 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Tibor Goldschwendt * @author Erich Schubert + * @since 0.2 */ @Reference(authors = "H. Akaike", // title = "On entropy maximization principle", // diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java index 8b21753d..22ad56d9 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java @@ -52,6 +52,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Tibor Goldschwendt * @author Erich Schubert + * @since 0.6.0 */ @Reference(authors = "G. Schwarz", // title = "Estimating the dimension of a model", // diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java index 755d7137..6a18269a 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Tibor Goldschwendt * @author Erich Schubert + * @since 0.2 */ @Reference(authors = "Q. Zhao, M. Xu, P. Fränti", // title = "Knee Point Detection on Bayesian Information Criterion", // diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java index bd0f12a1..100c2dfc 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio * Important note: some measures are ascending, others are descending! * * @author Erich Schubert + * @since 0.2 * * @param <O> Input Object restriction type */ diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java index cd7a4d14..e7069ac0 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java @@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio * The average of all average pairwise distances in a cluster. * * @author Stephan Baier + * @since 0.6.0 */ public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector> { @Override diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java index 09017413..fbd04dbd 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunctio * * @author Stephan Baier * @author Erich Schubert + * @since 0.6.0 */ public class WithinClusterVarianceQualityMeasure extends AbstractKMeansQualityMeasure<NumberVector> { @Override |