summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java224
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java70
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java244
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java137
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java31
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java71
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java64
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java70
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java295
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java40
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java163
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java130
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java393
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java)12
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java)382
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java176
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java)172
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java)14
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java)164
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java)7
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java)47
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java161
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java)166
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java)29
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java (renamed from src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java)63
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java272
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java154
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java239
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java74
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java77
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java67
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java65
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java61
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java2
39 files changed, 3212 insertions, 1219 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index 5754e961..bdfc2f04 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,14 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
*/
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.RandomlyChosenInitialMeans;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
@@ -43,11 +46,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
@@ -60,28 +62,26 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
- * @apiviz.has MeanModel
* @apiviz.composedOf KMeansInitialization
*
* @param <V> Vector type
- * @param <D> Distance type
* @param <M> Cluster model type
*/
-public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans<V, D, M>, ClusteringAlgorithm<Clustering<M>> {
+public abstract class AbstractKMeans<V extends NumberVector, M extends Model> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector, Clustering<M>> implements KMeans<V, M>, ClusteringAlgorithm<Clustering<M>> {
/**
- * Holds the value of {@link #K_ID}.
+ * Number of cluster centers to initialize.
*/
protected int k;
/**
- * Holds the value of {@link #MAXITER_ID}.
+ * Maximum number of iterations
*/
protected int maxiter;
/**
* Method to choose initial means.
*/
- protected KMeansInitialization<V> initializer;
+ protected KMeansInitialization<? super V> initializer;
/**
* Constructor.
@@ -91,7 +91,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param maxiter Maxiter parameter
* @param initializer Function to generate the initial means
*/
- public AbstractKMeans(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ public AbstractKMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
super(distanceFunction);
this.k = k;
this.maxiter = maxiter;
@@ -106,43 +106,26 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param means a list of k means
* @param clusters cluster assignment
* @param assignment Current cluster assignment
+ * @param varsum Variance sum output
* @return true when the object was reassigned
*/
- protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
+ protected boolean assignToNearestCluster(Relation<? extends V> relation, List<? extends NumberVector> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum) {
boolean changed = false;
-
- if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
- @SuppressWarnings("unchecked")
- final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(iditer);
- int minIndex = 0;
- for(int i = 0; i < k; i++) {
- double dist = df.doubleDistance(fv, means.get(i));
- if(dist < mindist) {
- minIndex = i;
- mindist = dist;
- }
+ Arrays.fill(varsum, 0.);
+ final PrimitiveDistanceFunction<? super NumberVector> df = getDistanceFunction();
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.distance(fv, means.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
}
- changed |= updateAssignment(iditer, clusters, assignment, minIndex);
- }
- }
- else {
- final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(iditer);
- int minIndex = 0;
- for(int i = 0; i < k; i++) {
- D dist = df.distance(fv, means.get(i));
- if(dist.compareTo(mindist) < 0) {
- minIndex = i;
- mindist = dist;
- }
- }
- changed |= updateAssignment(iditer, clusters, assignment, minIndex);
}
+ varsum[minIndex] += mindist;
+ changed |= updateAssignment(iditer, clusters, assignment, minIndex);
}
return changed;
}
@@ -173,7 +156,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param database the database containing the vectors
* @return the mean vectors of the given clusters in the given database
*/
- protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) {
+ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector> means, Relation<V> database) {
// TODO: use Kahan summation for better numerical precision?
List<Vector> newMeans = new ArrayList<>(k);
for(int i = 0; i < k; i++) {
@@ -187,7 +170,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
iter.advance();
// Update with remaining instances
for(; iter.valid(); iter.advance()) {
- NumberVector<?> vec = database.get(iter);
+ NumberVector vec = database.get(iter);
for(int j = 0; j < mean.getDimensionality(); j++) {
raw[j] += vec.doubleValue(j);
}
@@ -211,24 +194,23 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param database the database containing the vectors
* @return the mean vectors of the given clusters in the given database
*/
- protected List<NumberVector<?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> medians, Relation<V> database) {
+ protected List<Vector> medians(List<? extends ModifiableDBIDs> clusters, List<Vector> medians, Relation<V> database) {
final int dim = medians.get(0).getDimensionality();
final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database);
- List<NumberVector<?>> newMedians = new ArrayList<>(k);
+ List<Vector> newMedians = new ArrayList<>(k);
for(int i = 0; i < k; i++) {
ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i));
- if(list.size() > 0) {
- Vector mean = new Vector(dim);
- for(int d = 0; d < dim; d++) {
- sorter.setDimension(d);
- DBID id = QuickSelect.median(list, sorter);
- mean.set(d, database.get(id).doubleValue(d));
- }
- newMedians.add(mean);
+ if(list.size() <= 0) {
+ newMedians.add(medians.get(i));
+ continue;
}
- else {
- newMedians.add((NumberVector<?>) medians.get(i));
+ Vector mean = new Vector(dim);
+ for(int d = 0; d < dim; d++) {
+ sorter.setDimension(d);
+ DBID id = QuickSelect.median(list, sorter);
+ mean.set(d, database.get(id).doubleValue(d));
}
+ newMedians.add(mean);
}
return newMedians;
}
@@ -256,49 +238,30 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param means Means
* @param clusters Clusters
* @param assignment Current cluster assignment
+ * @param varsum Variance sum output
* @return true when the means have changed
*/
- protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
+ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum) {
boolean changed = false;
-
- if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
- // Raw distance function
- @SuppressWarnings("unchecked")
- final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
-
- // Incremental update
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(iditer);
- int minIndex = 0;
- for(int i = 0; i < k; i++) {
- double dist = df.doubleDistance(fv, means.get(i));
- if(dist < mindist) {
- minIndex = i;
- mindist = dist;
- }
- }
- changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
- }
- }
- else {
- // Raw distance function
- final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
-
- // Incremental update
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(iditer);
- int minIndex = 0;
- for(int i = 0; i < k; i++) {
- D dist = df.distance(fv, means.get(i));
- if(dist.compareTo(mindist) < 0) {
- minIndex = i;
- mindist = dist;
- }
+ Arrays.fill(varsum, 0.);
+
+ // Raw distance function
+ final PrimitiveDistanceFunction<? super NumberVector> df = getDistanceFunction();
+
+ // Incremental update
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.distance(fv, means.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
}
- changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
}
+ varsum[minIndex] += mindist;
+ changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
}
return changed;
}
@@ -339,18 +302,36 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
}
@Override
- public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) {
+ public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
this.distanceFunction = distanceFunction;
}
/**
+ * Log statistics on the variance sum.
+ *
+ * @param varstat Statistics log instance
+ * @param varsum Variance sum per cluster
+ */
+ protected void logVarstat(DoubleStatistic varstat, double[] varsum) {
+ if(varstat == null) {
+ return;
+ }
+ double s = 0.;
+ for(double v : varsum) {
+ s += v;
+ }
+ varstat.setDouble(s);
+ getLogger().statistics(varstat);
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- public abstract static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> {
+ public abstract static class Parameterizer<V extends NumberVector> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector> {
/**
* k Parameter.
*/
@@ -368,25 +349,58 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
@Override
protected void makeOptions(Parameterization config) {
- ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
- if(config.grab(distanceFunctionP)) {
- distanceFunction = distanceFunctionP.instantiateClass(config);
- if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
- getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!");
- }
- }
+ getParameterK(config);
+ getParameterInitialization(config);
+ getParameterDistanceFunction(config);
+ getParameterMaxIter(config);
+ }
+ /**
+ * Get the k parameter.
+ *
+ * @param config Parameterization
+ */
+ protected void getParameterK(Parameterization config) {
IntParameter kP = new IntParameter(K_ID);
kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
k = kP.getValue();
}
+ }
+ /**
+ * Get the distance function parameter.
+ *
+ * @param config Parameterization
+ */
+ protected void getParameterDistanceFunction(Parameterization config) {
+ ObjectParameter<PrimitiveDistanceFunction<NumberVector>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
+ if(config.grab(distanceFunctionP)) {
+ distanceFunction = distanceFunctionP.instantiateClass(config);
+ if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
+ getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!");
+ }
+ }
+ }
+
+ /**
+ * Get the initialization method parameter.
+ *
+ * @param config Parameterization
+ */
+ protected void getParameterInitialization(Parameterization config) {
ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class);
if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
+ }
+ /**
+ * Get the max iterations parameter.
+ *
+ * @param config Parameterization
+ */
+ protected void getParameterMaxIter(Parameterization config) {
IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(maxiterP)) {
@@ -402,6 +416,6 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
abstract protected Logging getLogger();
@Override
- abstract protected AbstractKMeans<V, D, ?> makeInstance();
+ abstract protected AbstractKMeans<V, ?> makeInstance();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
index 51e7ace9..1c9c6a71 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
/*
This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -32,7 +32,6 @@ import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -50,10 +49,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Erich Schubert
*
* @param <V> Vector type
- * @param <D> Distance type
* @param <M> Model type
*/
-public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> {
+public class BestOfMultipleKMeans<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, M> {
/**
* The logger for this class.
*/
@@ -67,12 +65,12 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
/**
* Variant of kMeans for the bisecting step.
*/
- private KMeans<V, D, M> innerkMeans;
+ private KMeans<V, M> innerkMeans;
/**
* Quality measure which should be used.
*/
- private KMeansQualityMeasure<? super V, ? super D> qualityMeasure;
+ private KMeansQualityMeasure<? super V> qualityMeasure;
/**
* Constructor.
@@ -81,7 +79,7 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
* @param innerkMeans K-Means variant to actually use.
* @param qualityMeasure Quality measure
*/
- public BestOfMultipleKMeans(int trials, KMeans<V, D, M> innerkMeans, KMeansQualityMeasure<? super V, ? super D> qualityMeasure) {
+ public BestOfMultipleKMeans(int trials, KMeans<V, M> innerkMeans, KMeansQualityMeasure<? super V> qualityMeasure) {
super();
this.trials = trials;
this.innerkMeans = innerkMeans;
@@ -93,34 +91,27 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
if(!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) {
throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass());
}
- final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction();
+ @SuppressWarnings("unchecked")
+ final PrimitiveDistanceFunction<? super NumberVector> df = (PrimitiveDistanceFunction<? super NumberVector>) innerkMeans.getDistanceFunction();
+
Clustering<M> bestResult = null;
- if(trials > 1) {
- double bestCost = Double.POSITIVE_INFINITY;
- FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
- for(int i = 0; i < trials; i++) {
- Clustering<M> currentCandidate = innerkMeans.run(database, relation);
- double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation);
-
- if(LOG.isVerbose()) {
- LOG.verbose("Cost of candidate " + i + ": " + currentCost);
- }
-
- if(currentCost < bestCost) {
- bestResult = currentCandidate;
- bestCost = currentCost;
- }
- if(prog != null) {
- prog.incrementProcessed(LOG);
- }
+ double bestCost = Double.NaN;
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
+ for(int i = 0; i < trials; i++) {
+ Clustering<M> currentCandidate = innerkMeans.run(database, relation);
+ double currentCost = qualityMeasure.quality(currentCandidate, df, relation);
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Cost of candidate " + i + ": " + currentCost);
}
- if(prog != null) {
- prog.ensureCompleted(LOG);
+
+ if(qualityMeasure.isBetter(currentCost, bestCost)) {
+ bestResult = currentCandidate;
+ bestCost = currentCost;
}
+ LOG.incrementProcessed(prog);
}
- else {
- bestResult = innerkMeans.run(database);
- }
+ LOG.ensureCompleted(prog);
return bestResult;
}
@@ -131,7 +122,7 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
}
@Override
- public DistanceFunction<? super V, D> getDistanceFunction() {
+ public DistanceFunction<? super V> getDistanceFunction() {
return innerkMeans.getDistanceFunction();
}
@@ -141,7 +132,7 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
}
@Override
- public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) {
+ public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
innerkMeans.setDistanceFunction(distanceFunction);
}
@@ -159,10 +150,9 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
* @apiviz.exclude
*
* @param <V> Vector type
- * @param <D> Distance type
* @param <M> Model type
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer {
/**
* Parameter to specify the iterations of the bisecting step.
*/
@@ -186,12 +176,12 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
/**
* Variant of kMeans to use.
*/
- protected KMeans<V, D, M> kMeansVariant;
+ protected KMeans<V, M> kMeansVariant;
/**
* Quality measure.
*/
- protected KMeansQualityMeasure<? super V, ? super D> qualityMeasure;
+ protected KMeansQualityMeasure<? super V> qualityMeasure;
@Override
protected void makeOptions(Parameterization config) {
@@ -201,19 +191,19 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
trials = trialsP.intValue();
}
- ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
+ ObjectParameter<KMeans<V, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
if(config.grab(kMeansVariantP)) {
kMeansVariant = kMeansVariantP.instantiateClass(config);
}
- ObjectParameter<KMeansQualityMeasure<V, ? super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class);
+ ObjectParameter<KMeansQualityMeasure<V>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class);
if(config.grab(qualityMeasureP)) {
qualityMeasure = qualityMeasureP.instantiateClass(config);
}
}
@Override
- protected BestOfMultipleKMeans<V, D, M> makeInstance() {
+ protected BestOfMultipleKMeans<V, M> makeInstance() {
return new BestOfMultipleKMeans<>(trials, kMeansVariant, qualityMeasure);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java
new file mode 100644
index 00000000..0a47deba
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java
@@ -0,0 +1,244 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * Clustering Large Applications (CLARA) is a clustering method for large data
+ * sets based on PAM, partitioning around medoids ({@link KMedoidsPAM}) based on
+ * sampling.
+ *
+ * Reference:
+ * <p>
+ * L. Kaufman, P. J. Rousseeuw<br />
+ * Clustering Large Data Sets (with discussion)<br />
+ * in: Pattern Recognition in Practice II
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+@Reference(authors = "L. Kaufman, P. J. Rousseeuw",//
+title = "Clustering Large Data Sets (with discussion)", //
+booktitle = "Pattern Recognition in Practice II")
+public class CLARA<V> extends KMedoidsPAM<V> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(CLARA.class);
+
+ /**
+ * Sampling rate. If less than 1, it is considered to be a relative value.
+ */
+ double sampling;
+
+ /**
+ * Number of samples to draw (i.e. iterations).
+ */
+ int numsamples;
+
+ /**
+ * Random factory for initialization.
+ */
+ RandomFactory random;
+
+ public CLARA(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer, int numsamples, double sampling, RandomFactory random) {
+ super(distanceFunction, k, maxiter, initializer);
+ this.numsamples = numsamples;
+ this.sampling = sampling;
+ this.random = random;
+ }
+
+ @Override
+ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("CLARA Clustering", "clara-clustering");
+ }
+ DBIDs ids = relation.getDBIDs();
+ int sampleSize = (int) ((sampling < 1.) ? sampling * ids.size() : sampling);
+ DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+
+ double best = Double.POSITIVE_INFINITY;
+ ArrayModifiableDBIDs bestmedoids = null;
+ List<ModifiableDBIDs> bestclusters = null;
+
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Random samples.", numsamples, LOG) : null;
+ for(int j = 0; j < numsamples; j++) {
+ DBIDs rids = DBIDUtil.randomSample(ids, sampleSize, random);
+ // Choose initial medoids
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, rids, distQ));
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+ runPAMOptimization(distQ, rids, medoids, clusters);
+ double score = assignRemainingToNearestCluster(medoids, ids, rids, clusters, distQ);
+ if(score < best) {
+ best = score;
+ bestmedoids = medoids;
+ bestclusters = clusters;
+ }
+ LOG.incrementProcessed(prog);
+ }
+ LOG.ensureCompleted(prog);
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<>("CLARA Clustering", "clara-clustering");
+ for(int i = 0; i < bestclusters.size(); i++) {
+ MedoidModel model = new MedoidModel(bestmedoids.get(i));
+ result.addToplevelCluster(new Cluster<>(bestclusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Assign each object that is not part of the sample to its nearest medoid,
+ * adding it to the corresponding cluster, and return the sum of distances.
+ *
+ * @param means Object medoids
+ * @param ids Object ids
+ * @param rids Sample that was already assigned
+ * @param clusters cluster assignment
+ * @param distQ distance query
+ * @return Sum of distances.
+ */
+ protected double assignRemainingToNearestCluster(ArrayDBIDs means, DBIDs ids, DBIDs rids, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) {
+ rids = DBIDUtil.ensureSet(rids); // Ensure we have fast contains
+ double distsum = 0.;
+ DBIDArrayIter miter = means.iter();
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(rids.contains(iditer)) {
+ continue;
+ }
+ double mindist = Double.POSITIVE_INFINITY;
+ int minIndex = 0;
+ miter.seek(0); // Reuse iterator.
+ for(int i = 0; miter.valid(); miter.advance(), i++) {
+ double dist = distQ.distance(iditer, miter);
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ distsum += mindist;
+ clusters.get(minIndex).add(iditer);
+ }
+ return distsum;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V> extends KMedoidsPAM.Parameterizer<V> {
+ /**
+ * The number of samples to run.
+ */
+ public static final OptionID NUMSAMPLES_ID = new OptionID("clara.samples", "Number of samples (iterations) to run.");
+
+ /**
+ * The sample size.
+ */
+ public static final OptionID SAMPLESIZE_ID = new OptionID("clara.samplesize", "The size of the sample.");
+
+ /**
+ * Random generator.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("clara.random", "Random generator seed.");
+
+ /**
+ * Sampling rate. If less than 1, it is considered to be a relative value.
+ */
+ double sampling;
+
+ /**
+ * Number of samples to draw (i.e. iterations).
+ */
+ int numsamples;
+
+ /**
+ * Random factory for initialization.
+ */
+ RandomFactory random;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter numsamplesP = new IntParameter(NUMSAMPLES_ID, 5) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(numsamplesP)) {
+ numsamples = numsamplesP.intValue();
+ }
+
+ DoubleParameter samplingP = new DoubleParameter(SAMPLESIZE_ID) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(samplingP)) {
+ sampling = samplingP.doubleValue();
+ }
+
+ RandomParameter randomP = new RandomParameter(RANDOM_ID);
+ if(config.grab(randomP)) {
+ random = randomP.getValue();
+ }
+ }
+
+ @Override
+ protected CLARA<V> makeInstance() {
+ return new CLARA<>(distanceFunction, k, maxiter, initializer, numsamples, sampling, random);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
index 29c0a5c8..32d7a978 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,11 +27,10 @@ import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
/**
@@ -40,10 +39,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
* @author Erich Schubert
*
* @param <V> Number vector type
- * @param <D> Distance type
* @param <M> Actual model type
*/
-public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V, D> {
+public interface KMeans<V extends NumberVector, M extends Model> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V> {
/**
* Parameter to specify the initialization method
*/
@@ -81,11 +79,11 @@ public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M exte
* @param k K parameter
*/
void setK(int k);
-
+
/**
* Set the distance function to use.
*
* @param distanceFunction Distance function.
*/
- void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction);
+ void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
index aec4fe0f..57c2a295 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,6 +27,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -43,13 +44,12 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -57,7 +57,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
/**
- * Provides the k-means algorithm, using Lloyd-style bulk iterations.
+ * An algorithm for k-means, using Lloyd-style bulk iterations.
*
* However, in contrast to Lloyd's k-means and similar to MacQueen, we do update
* the mean vectors multiple times, not only at the very end of the iteration.
@@ -71,9 +71,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* @apiviz.has KMeansModel
*
* @param <V> vector datatype
- * @param <D> distance value type
*/
-public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+public class KMeansBatchedLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
/**
* The logger for this class.
*/
@@ -99,26 +98,21 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
* @param blocks Number of blocks
* @param random Random factory used for partitioning.
*/
- public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer, int blocks, RandomFactory random) {
+ public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer, int blocks, RandomFactory random) {
super(distanceFunction, k, maxiter, initializer);
this.blocks = blocks;
this.random = random;
}
@Override
- public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
final int dim = RelationUtil.dimensionality(relation);
// Choose initial means
- List<? extends NumberVector<?>> mvs = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
- // Convert to (modifiable) math vectors.
- List<Vector> means = new ArrayList<>(k);
- for (NumberVector<?> m : mvs) {
- means.add(m.getColumnVector());
- }
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
@@ -127,45 +121,44 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
double[][] meanshift = new double[k][dim];
int[] changesize = new int[k];
+ double[] varsum = new double[k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
- for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
+ DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
boolean changed = false;
FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null;
- for (int p = 0; p < parts.length; p++) {
+ for(int p = 0; p < parts.length; p++) {
// Initialize new means scratch space.
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
Arrays.fill(meanshift[i], 0.);
}
Arrays.fill(changesize, 0);
- changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment);
+ Arrays.fill(varsum, 0.);
+ changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment, varsum);
// Recompute means.
updateMeans(means, meanshift, clusters, changesize);
- if (pprog != null) {
- pprog.incrementProcessed(LOG);
- }
- }
- if (pprog != null) {
- pprog.ensureCompleted(LOG);
+ LOG.incrementProcessed(pprog);
}
+ LOG.ensureCompleted(pprog);
+ logVarstat(varstat, varsum);
// Stop if no cluster assignment changed.
- if (!changed) {
+ if(!changed) {
break;
}
}
- if (prog != null) {
- prog.setCompleted(LOG);
- }
+ LOG.setCompleted(prog);
// Wrap result
- final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
- for (int i = 0; i < clusters.size(); i++) {
- KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
- result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ if(ids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(ids, model));
}
return result;
}
@@ -181,42 +174,26 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
* @param changesize New cluster sizes
* @param clusters cluster assignment
* @param assignment Current cluster assignment
+ * @param varsum Sum of variances
* @return true when the object was reassigned
*/
- protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector<?>> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
+ protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum) {
boolean changed = false;
- if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
- @SuppressWarnings("unchecked")
- final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
- for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
- double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(iditer);
- int minIndex = 0;
- for (int i = 0; i < k; i++) {
- double dist = df.doubleDistance(fv, oldmeans.get(i));
- if (dist < mindist) {
- minIndex = i;
- mindist = dist;
- }
- }
- changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
- }
- } else {
- final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
- for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
- D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(iditer);
- int minIndex = 0;
- for (int i = 0; i < k; i++) {
- D dist = df.distance(fv, oldmeans.get(i));
- if (dist.compareTo(mindist) < 0) {
- minIndex = i;
- mindist = dist;
- }
+ final PrimitiveDistanceFunction<? super NumberVector> df = getDistanceFunction();
+ for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.distance(fv, oldmeans.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
}
- changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
}
+ varsum[minIndex] += mindist;
+ changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
}
return changed;
}
@@ -235,7 +212,7 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
*/
protected boolean updateAssignment(DBIDIter id, V fv, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[][] meanshift, int[] changesize, int minIndex) {
int cur = assignment.intValue(id);
- if (cur == minIndex) {
+ if(cur == minIndex) {
return false;
}
// Add to new cluster.
@@ -243,16 +220,16 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
clusters.get(minIndex).add(id);
changesize[minIndex]++;
double[] raw = meanshift[minIndex];
- for (int j = 0; j < fv.getDimensionality(); j++) {
+ for(int j = 0; j < fv.getDimensionality(); j++) {
raw[j] += fv.doubleValue(j);
}
}
// Remove from previous cluster
- if (cur >= 0) {
+ if(cur >= 0) {
clusters.get(cur).remove(id);
changesize[cur]--;
double[] raw = meanshift[cur];
- for (int j = 0; j < fv.getDimensionality(); j++) {
+ for(int j = 0; j < fv.getDimensionality(); j++) {
raw[j] -= fv.doubleValue(j);
}
}
@@ -269,16 +246,16 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
* @param changesize Size of change (for weighting!)
*/
protected void updateMeans(List<Vector> means, double[][] meanshift, List<ModifiableDBIDs> clusters, int[] changesize) {
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
int newsize = clusters.get(i).size(), oldsize = newsize - changesize[i];
- if (newsize == 0) {
+ if(newsize == 0) {
continue; // Keep previous mean vector.
}
- if (oldsize == 0) {
+ if(oldsize == 0) {
means.set(i, new Vector(meanshift[i]).times(1. / newsize));
continue; // Replace with new vector.
}
- if (oldsize == newsize) {
+ if(oldsize == newsize) {
means.get(i).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize);
continue;
}
@@ -298,7 +275,7 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
/**
* Parameter for the number of blocks.
*/
@@ -324,11 +301,11 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
super.makeOptions(config);
IntParameter blocksP = new IntParameter(BLOCKS_ID, 10);
blocksP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
- if (config.grab(blocksP)) {
+ if(config.grab(blocksP)) {
blocks = blocksP.intValue();
}
RandomParameter randomP = new RandomParameter(RANDOM_ID);
- if (config.grab(randomP)) {
+ if(config.grab(randomP)) {
random = randomP.getValue();
}
}
@@ -339,7 +316,7 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>
}
@Override
- protected KMeansBatchedLloyd<V, D> makeInstance() {
+ protected KMeansBatchedLloyd<V> makeInstance() {
return new KMeansBatchedLloyd<>(distanceFunction, k, maxiter, initializer, blocks, random);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
index 80a581b1..c6361550 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
/*
This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -35,7 +35,6 @@ import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
@@ -63,11 +62,10 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Stephan Baier
*
* @param <V> Vector type
- * @param <D> Distance type
* @param <M> Model type
*/
@Reference(authors = "M. Steinbach, G. Karypis, V. Kumar", title = "A Comparison of Document Clustering Techniques", booktitle = "KDD workshop on text mining. Vol. 400. No. 1")
-public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> {
+public class KMeansBisecting<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, M> {
/**
* The logger for this class.
*/
@@ -76,7 +74,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
/**
* Variant of kMeans for the bisecting step.
*/
- private KMeans<V, D, M> innerkMeans;
+ private KMeans<V, M> innerkMeans;
/**
* Desired value of k.
@@ -89,7 +87,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
* @param k k parameter - number of result clusters
* @param innerkMeans KMeans variant parameter - for bisecting step
*/
- public KMeansBisecting(int k, KMeans<V, D, M> innerkMeans) {
+ public KMeansBisecting(int k, KMeans<V, M> innerkMeans) {
super();
this.k = k;
this.innerkMeans = innerkMeans;
@@ -128,16 +126,12 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
// Add resulting clusters to current result.
currentClusterList.addAll(innerResult.getAllClusters());
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(prog);
if (LOG.isVerbose()) {
LOG.verbose("Iteration " + j);
}
}
- if (prog != null) {
- prog.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(prog);
// add all current clusters to the result
Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
@@ -153,7 +147,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
}
@Override
- public DistanceFunction<? super V, D> getDistanceFunction() {
+ public DistanceFunction<? super V> getDistanceFunction() {
return innerkMeans.getDistanceFunction();
}
@@ -163,7 +157,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
}
@Override
- public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) {
+ public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
innerkMeans.setDistanceFunction(distanceFunction);
}
@@ -181,10 +175,9 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
* @apiviz.exclude
*
* @param <V> Vector type
- * @param <D> Distance type
* @param <M> Model type
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer {
/**
* Parameter to specify the kMeans variant.
*/
@@ -193,7 +186,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
/**
* Variant of kMeans
*/
- protected KMeans<V, D, M> kMeansVariant;
+ protected KMeans<V, M> kMeansVariant;
/**
* Desired number of clusters.
@@ -210,7 +203,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
k = kP.intValue();
}
- ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class);
+ ObjectParameter<KMeans<V, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class);
if (config.grab(kMeansVariantP)) {
ListParameterization kMeansVariantParameters = new ListParameterization();
@@ -224,7 +217,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
}
@Override
- protected KMeansBisecting<V, D, M> makeInstance() {
+ protected KMeansBisecting<V, M> makeInstance() {
return new KMeansBisecting<>(k, kMeansVariant);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
index 2a60ef27..a978745a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -35,28 +36,26 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
/**
- * Provides the k-means algorithm, alternating between MacQueen-style
- * incremental processing and Lloyd-Style batch steps.
+ * A hybrid k-means algorithm, alternating between MacQueen-style incremental
+ * processing and Lloyd-Style batch steps.
*
* @author Erich Schubert
*
- * @apiviz.landmark
* @apiviz.has KMeansModel
*
* @param <V> vector datatype
- * @param <D> distance value type
*/
-public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+public class KMeansHybridLloydMacQueen<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
/**
* The logger for this class.
*/
@@ -70,61 +69,59 @@ public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Dist
* @param maxiter Maxiter parameter
* @param initializer Initialization method
*/
- public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
super(distanceFunction, k, maxiter, initializer);
}
@Override
- public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = new ArrayList<>(k);
- for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) {
- means.add(nv.getColumnVector());
- }
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
- for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) {
+ DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) {
{ // MacQueen
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
- boolean changed = macQueenIterate(relation, means, clusters, assignment);
- if (!changed) {
+ LOG.incrementProcessed(prog);
+ boolean changed = macQueenIterate(relation, means, clusters, assignment, varsum);
+ logVarstat(varstat, varsum);
+ if(!changed) {
break;
}
}
{ // Lloyd
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
- boolean changed = assignToNearestCluster(relation, means, clusters, assignment);
+ LOG.incrementProcessed(prog);
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
+ logVarstat(varstat, varsum);
// Stop if no cluster assignment changed.
- if (!changed) {
+ if(!changed) {
break;
}
// Recompute means.
means = means(clusters, means, relation);
}
}
- if (prog != null) {
- prog.setCompleted(LOG);
- }
+ LOG.setCompleted(prog);
// Wrap result
- final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
- for (int i = 0; i < clusters.size(); i++) {
- KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
- result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ if(ids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(ids, model));
}
return result;
}
@@ -141,14 +138,14 @@ public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Dist
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
@Override
protected Logging getLogger() {
return LOG;
}
@Override
- protected KMeansHybridLloydMacQueen<V, D> makeInstance() {
+ protected KMeansHybridLloydMacQueen<V> makeInstance() {
return new KMeansHybridLloydMacQueen<>(distanceFunction, k, maxiter, initializer);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index 686e2076..ed92190d 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -35,19 +36,20 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
/**
- * Provides the k-means algorithm, using Lloyd-style bulk iterations.
+ * The standard k-means algorithm, using Lloyd-style bulk iterations.
*
* <p>
* Reference:<br />
@@ -63,12 +65,14 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
* @apiviz.has KMeansModel
*
* @param <V> vector datatype
- * @param <D> distance value type
*/
@Title("K-Means")
-@Description("Finds a partitioning into k clusters.")
-@Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
-public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+@Description("Finds a least-squared partitioning into k clusters.")
+@Reference(authors = "S. Lloyd", //
+title = "Least squares quantization in PCM", //
+booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", //
+url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
+public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
/**
* The logger for this class.
*/
@@ -82,47 +86,49 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
* @param maxiter Maxiter parameter
* @param initializer Initialization method
*/
- public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ public KMeansLloyd(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
super(distanceFunction, k, maxiter, initializer);
}
@Override
- public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
- for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
- boolean changed = assignToNearestCluster(relation, means, clusters, assignment);
+ DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
+ logVarstat(varstat, varsum);
// Stop if no cluster assignment changed.
- if (!changed) {
+ if(!changed) {
break;
}
// Recompute means.
means = means(clusters, means, relation);
}
- if (prog != null) {
- prog.setCompleted(LOG);
- }
+ LOG.setCompleted(prog);
// Wrap result
- final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
- for (int i = 0; i < clusters.size(); i++) {
- KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
- result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ if(ids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(ids, model));
}
return result;
}
@@ -139,14 +145,14 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
@Override
protected Logging getLogger() {
return LOG;
}
@Override
- protected KMeansLloyd<V, D> makeInstance() {
+ protected KMeansLloyd<V> makeInstance() {
return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index a0f4bb3f..7d2f805a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -38,18 +39,22 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
/**
- * Provides the k-means algorithm, using MacQueen style incremental updates.
+ * The original k-means algorithm, using MacQueen style incremental updates;
+ * making this effectively an "online" (streaming) algorithm.
+ *
+ * This implementation will by default iterate over the data set until
+ * convergence, although MacQueen likely only meant to do a single pass over the
+ * data.
*
* <p>
* Reference:<br />
@@ -62,12 +67,14 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
* @apiviz.has KMeansModel
*
* @param <V> vector type to use
- * @param <D> distance function value type
*/
@Title("K-Means")
-@Description("Finds a partitioning into k clusters.")
-@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992")
-public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+@Description("Finds a least-squares partitioning into k clusters.")
+@Reference(authors = "J. MacQueen", //
+title = "Some Methods for Classification and Analysis of Multivariate Observations", //
+booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", //
+url = "http://projecteuclid.org/euclid.bsmsp/1200512992")
+public class KMeansMacQueen<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
/**
* The logger for this class.
*/
@@ -81,47 +88,44 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
* @param maxiter Maxiter parameter
* @param initializer Initialization method
*/
- public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
super(distanceFunction, k, maxiter, initializer);
}
@Override
- public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = new ArrayList<>(k);
- for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) {
- means.add(nv.getColumnVector());
- }
- // Initialize cluster and assign objects
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
- // Refine result
- for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
- boolean changed = macQueenIterate(relation, means, clusters, assignment);
- if (!changed) {
+ DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
+ // Iterate MacQueen
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ boolean changed = macQueenIterate(relation, means, clusters, assignment, varsum);
+ logVarstat(varstat, varsum);
+ if(!changed) {
break;
}
}
- if (prog != null) {
- prog.setCompleted(LOG);
- }
+ LOG.setCompleted(prog);
- final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
- for (int i = 0; i < clusters.size(); i++) {
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
DBIDs ids = clusters.get(i);
- KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getArrayRef()));
+ if(ids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
result.addToplevelCluster(new Cluster<>(ids, model));
}
return result;
@@ -139,14 +143,14 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
@Override
protected Logging getLogger() {
return LOG;
}
@Override
- protected KMeansMacQueen<V, D> makeInstance() {
+ protected KMeansMacQueen<V> makeInstance() {
return new KMeansMacQueen<>(distanceFunction, k, maxiter, initializer);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
index 0a97c4d3..49b67599 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -1,147 +1,148 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import java.util.ArrayList;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.data.Cluster;
-import de.lmu.ifi.dbs.elki.data.Clustering;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
-import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
-import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
-import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-
-/**
- * Provides the k-medians clustering algorithm, using Lloyd-style bulk
- * iterations.
- *
- * Reference:
- * <p>
- * Clustering via Concave Minimization<br />
- * P. S. Bradley, O. L. Mangasarian, W. N. Street<br />
- * in: Advances in neural information processing systems
- * </p>
- *
- * @author Erich Schubert
- *
- * @param <V> vector datatype
- * @param <D> distance value type
- */
-@Title("K-Medians")
-@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url = "http://nips.djvuzone.org/djvu/nips09/0368.djvu")
-public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, MeanModel<V>> {
- /**
- * The logger for this class.
- */
- private static final Logging LOG = Logging.getLogger(KMediansLloyd.class);
-
- /**
- * Constructor.
- *
- * @param distanceFunction distance function
- * @param k k parameter
- * @param maxiter Maxiter parameter
- * @param initializer Initialization method
- */
- public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
- super(distanceFunction, k, maxiter, initializer);
- }
-
- @Override
- public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
- return new Clustering<>("k-Medians Clustering", "kmedians-clustering");
- }
- // Choose initial medians
- List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
- // Setup cluster assignment store
- List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
- }
- WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
-
- IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
- for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (prog != null) {
- prog.incrementProcessed(LOG);
- }
- boolean changed = assignToNearestCluster(relation, medians, clusters, assignment);
- // Stop if no cluster assignment changed.
- if (!changed) {
- break;
- }
- // Recompute medians.
- medians = medians(clusters, medians, relation);
- }
- if (prog != null) {
- prog.setCompleted(LOG);
- }
- // Wrap result
- final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<MeanModel<V>> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering");
- for (int i = 0; i < clusters.size(); i++) {
- MeanModel<V> model = new MeanModel<>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef()));
- result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
- }
- return result;
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-
- @Override
- protected KMediansLloyd<V, D> makeInstance() {
- return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer);
- }
- }
-}
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+
+/**
+ * A k-medians clustering algorithm, using Lloyd-style bulk iterations instead
+ * of the more complicated approach suggested by Kaufman and Rousseeuw (see
+ * {@link KMedoidsPAM} for the latter).
+ *
+ * Reference:
+ * <p>
+ * Clustering via Concave Minimization<br />
+ * P. S. Bradley, O. L. Mangasarian, W. N. Street<br />
+ * in: Advances in Neural Information Processing Systems (NIPS)
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector datatype
+ */
+@Title("K-Medians")
+@Reference(title = "Clustering via Concave Minimization", //
+authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", //
+booktitle = "Advances in Neural Information Processing Systems", //
+url = "https://papers.nips.cc/paper/1260-clustering-via-concave-minimization.pdf")
+public class KMediansLloyd<V extends NumberVector> extends AbstractKMeans<V, MeanModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMediansLloyd.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ */
+ public KMediansLloyd(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ @Override
+ public Clustering<MeanModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("k-Medians Clustering", "kmedians-clustering");
+ }
+ // Choose initial medians
+ List<Vector> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] distsum = new double[k];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ boolean changed = assignToNearestCluster(relation, medians, clusters, assignment, distsum);
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute medians.
+ medians = medians(clusters, medians, relation);
+ }
+ LOG.setCompleted(prog);
+ // Wrap result
+ Clustering<MeanModel> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MeanModel model = new MeanModel(medians.get(i));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected KMediansLloyd<V> makeInstance() {
+ return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
index 41cca225..d2f98466 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,8 +27,9 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.FarthestPointsInitialMeans;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
@@ -43,8 +44,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.Mean;
@@ -54,8 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
- * Provides the k-medoids clustering algorithm, using a "bulk" variation of the
- * "Partitioning Around Medoids" approach.
+ * A k-medoids clustering algorithm, implemented as an EM-style bulk algorithm.
*
* In contrast to PAM, which will in each iteration update one medoid with one
* (arbitrary) non-medoid, this implementation follows the EM pattern. In the
@@ -72,9 +71,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @apiviz.composedOf KMedoidsInitialization
*
* @param <V> vector datatype
- * @param <D> distance value type
*/
-public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+public class KMedoidsEM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
/**
* The logger for this class.
*/
@@ -103,7 +101,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
* @param maxiter Maxiter parameter
* @param initializer Function to generate the initial means
*/
- public KMedoidsEM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ public KMedoidsEM(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
super(distanceFunction);
this.k = k;
this.maxiter = maxiter;
@@ -121,9 +119,9 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
- DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
// Choose initial medoids
- ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for(int i = 0; i < k; i++) {
@@ -139,9 +137,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
// Swap phase
boolean changed = true;
while(changed) {
- if(prog != null) {
- prog.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(prog);
changed = false;
// Try to swap the medoid with a better cluster member:
int i = 0;
@@ -154,7 +150,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
}
Mean mdist = new Mean();
for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
- mdist.put(distQ.distance(iter, iter2).doubleValue());
+ mdist.put(distQ.distance(iter, iter2));
}
if(mdist.getMean() < bestm.getMean()) {
best = DBIDUtil.deref(iter);
@@ -172,9 +168,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
assignToNearestCluster(medoids, mdists, clusters, distQ);
}
}
- if(prog != null) {
- prog.setCompleted(LOG);
- }
+ LOG.setCompleted(prog);
// Wrap result
Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
@@ -195,7 +189,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
* @param distQ distance query
* @return true when the object was reassigned
*/
- protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) {
boolean changed = false;
double[] dists = new double[k];
@@ -205,7 +199,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
{
int i = 0;
for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
- dists[i] = distQ.distance(iditer, miter).doubleValue();
+ dists[i] = distQ.distance(iditer, miter);
if(dists[i] < mindist) {
minIndex = i;
mindist = dists[i];
@@ -247,7 +241,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
*
* @apiviz.exclude
*/
- public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> {
protected int k;
protected int maxiter;
@@ -263,7 +257,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
k = kP.intValue();
}
- ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, FarthestPointsInitialMeans.class);
if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
@@ -276,7 +270,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
}
@Override
- protected KMedoidsEM<V, D> makeInstance() {
+ protected KMedoidsEM<V> makeInstance() {
return new KMedoidsEM<>(distanceFunction, k, maxiter, initializer);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
index c9e1dc47..30bb56e5 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,8 +27,9 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
@@ -40,15 +41,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
@@ -59,14 +60,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
- * Provides the k-medoids clustering algorithm, using the
- * "Partitioning Around Medoids" approach.
+ * The original PAM algorithm for k-medoids clustering, as proposed by Kaufman
+ * and Rousseeuw in "Partitioning Around Medoids".
*
* Reference:
* <p>
* Clustering my means of Medoids<br />
* Kaufman, L. and Rousseeuw, P.J.<br />
- * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods
+ * in: Statistical Data Analysis Based on the L1-Norm and Related Methods
* </p>
*
* @author Erich Schubert
@@ -75,23 +76,24 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @apiviz.composedOf KMedoidsInitialization
*
* @param <V> vector datatype
- * @param <D> distance value type
*/
@Title("Partioning Around Medoids")
-@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
-public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+@Reference(title = "Clustering by means of Medoids", //
+authors = "Kaufman, L. and Rousseeuw, P.J.", //
+booktitle = "Statistical Data Analysis Based on the L1-Norm and Related Methods")
+public class KMedoidsPAM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class);
/**
- * Holds the value of {@link AbstractKMeans#K_ID}.
+ * The number of clusters to produce.
*/
protected int k;
/**
- * Holds the value of {@link AbstractKMeans#MAXITER_ID}.
+ * The maximum number of iterations.
*/
protected int maxiter;
@@ -108,7 +110,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
* @param maxiter Maxiter parameter
* @param initializer Function to generate the initial means
*/
- public KMedoidsPAM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ public KMedoidsPAM(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
super(distanceFunction);
this.k = k;
this.maxiter = maxiter;
@@ -126,16 +128,36 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
- DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
DBIDs ids = relation.getDBIDs();
// Choose initial medoids
- ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
+ runPAMOptimization(distQ, ids, medoids, clusters);
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MedoidModel model = new MedoidModel(medoids.get(i));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Run the PAM optimization phase.
+ *
+ * @param distQ Distance query
+ * @param ids IDs to process
+ * @param medoids Medoids list
+ * @param clusters Clusters
+ */
+ protected void runPAMOptimization(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, List<ModifiableDBIDs> clusters) {
WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
// Initial assignment to nearest medoids
// TODO: reuse this information, from the build phase, when possible?
@@ -143,15 +165,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
// Swap phase
+ DBIDVar bestid = DBIDUtil.newVar();
boolean changed = true;
while(changed) {
- if(prog != null) {
- prog.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(prog);
changed = false;
// Try to swap the medoid with a better cluster member:
double best = 0;
- DBID bestid = null;
int bestcluster = -1;
int i = 0;
for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
@@ -159,30 +179,28 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
if(DBIDUtil.equal(miter, iter)) {
continue;
}
- // double disti = distQ.distance(id, med).doubleValue();
double cost = 0;
- DBIDIter olditer = medoids.iter();
- for(int j = 0; j < k; j++, olditer.advance()) {
+ DBIDIter miter2 = medoids.iter();
+ for(int j = 0; j < k; j++, miter2.advance()) {
for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
- double distcur = distQ.distance(iter2, olditer).doubleValue();
- double distnew = distQ.distance(iter2, iter).doubleValue();
+ if(DBIDUtil.equal(miter2, iter2)) {
+ continue;
+ }
+ double distcur = distQ.distance(iter2, miter2);
+ double distnew = distQ.distance(iter2, iter);
if(j == i) {
// Cases 1 and 2.
double distsec = second.doubleValue(iter2);
- if(distcur > distsec) {
- // Case 1, other would switch to a third medoid
- cost += distsec - distcur; // Always positive!
- }
- else { // Would remain with the candidate
- cost += distnew - distcur; // Could be negative
- }
+ cost += (distcur > distsec) ? //
+ // Case 1, other would switch to a third medoid
+ distsec - distcur // Always positive!
+ : // Would remain with the candidate
+ distnew - distcur; // Could be negative
}
else {
// Cases 3-4: objects from other clusters
- if(distcur < distnew) {
- // Case 3: no change
- }
- else {
+ // Case 3: is no change
+ if(distcur > distnew) {
// Case 4: would switch to new medoid
cost += distnew - distcur; // Always negative
}
@@ -191,18 +209,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
}
if(cost < best) {
best = cost;
- bestid = DBIDUtil.deref(iter);
+ bestid.set(iter);
bestcluster = i;
}
}
}
- if(prog != null) {
- prog.setCompleted(LOG);
- }
if(LOG.isDebugging()) {
LOG.debug("Best cost: " + best);
}
- if(bestid != null) {
+ if(best < 0.) {
changed = true;
medoids.set(bestcluster, bestid);
}
@@ -212,14 +227,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
assignToNearestCluster(medoids, ids, second, clusters, distQ);
}
}
-
- // Wrap result
- Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
- for(int i = 0; i < clusters.size(); i++) {
- MedoidModel model = new MedoidModel(medoids.get(i));
- result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
- }
- return result;
+ LOG.setCompleted(prog);
}
/**
@@ -233,36 +241,32 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
* @param distQ distance query
* @return true when any object was reassigned
*/
- protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) {
boolean changed = false;
+ DBIDArrayIter miter = means.iter();
for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY, mindist2 = Double.POSITIVE_INFINITY;
int minIndex = 0;
- double mindist = Double.POSITIVE_INFINITY;
- double mindist2 = Double.POSITIVE_INFINITY;
- {
- int i = 0;
- for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
- double dist = distQ.distance(iditer, miter).doubleValue();
- if(dist < mindist) {
- minIndex = i;
- mindist2 = mindist;
- mindist = dist;
- }
- else if(dist < mindist2) {
- mindist2 = dist;
- }
+ miter.seek(0); // Reuse iterator.
+ for(int i = 0; miter.valid(); miter.advance(), i++) {
+ double dist = distQ.distance(iditer, miter);
+ if(dist < mindist) {
+ minIndex = i;
+ mindist2 = mindist;
+ mindist = dist;
+ }
+ else if(dist < mindist2) {
+ mindist2 = dist;
}
}
if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for(int i = 0; i < k; i++) {
- if(i != minIndex) {
- if(clusters.get(i).remove(iditer)) {
- break;
- }
+ for(int j = 0; j < k; j++) {
+ if(j != minIndex && clusters.get(j).remove(iditer)) {
+ break;
}
}
}
@@ -288,18 +292,27 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
*
* @apiviz.exclude
*/
- public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> {
+ /**
+ * The number of clusters to produce.
+ */
protected int k;
+ /**
+ * The maximum number of iterations.
+ */
protected int maxiter;
+ /**
+ * Method to choose initial means.
+ */
protected KMedoidsInitialization<V> initializer;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ IntParameter kP = new IntParameter(KMeans.K_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
k = kP.intValue();
}
@@ -309,15 +322,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
initializer = initialP.instantiateClass(config);
}
- IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0);
- maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(maxiterP)) {
maxiter = maxiterP.intValue();
}
}
@Override
- protected KMedoidsPAM<V, D> makeInstance() {
+ protected KMedoidsPAM<V> makeInstance() {
return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java
new file mode 100644
index 00000000..d63d52d4
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java
@@ -0,0 +1,130 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+
+/**
+ * Pseudo-k-Means variation, that assigns each object to the nearest center.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ */
+public class SingleAssignmentKMeans<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(SingleAssignmentKMeans.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param initializer Initialization method
+ */
+ public SingleAssignmentKMeans(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, KMeansInitialization<? super V> initializer) {
+ super(distanceFunction, k, -1, initializer);
+ }
+
+ @Override
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("k-Means Assignment", "kmeans-assignment");
+ }
+ // Choose initial means
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
+
+ assignToNearestCluster(relation, means, clusters, assignment, varsum);
+
+ // Wrap result
+ Clustering<KMeansModel> result = new Clustering<>("Nearest Centroid Clustering", "nearest-center-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ // Do NOT invoke super.makeOptions, as we don't want to have the maxiter
+ // parameter, nor a warning for other distance functions.
+ getParameterDistanceFunction(config);
+ getParameterK(config);
+ getParameterInitialization(config);
+ }
+
+ @Override
+ protected SingleAssignmentKMeans<V> makeInstance() {
+ return new SingleAssignmentKMeans<>(distanceFunction, k, initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
new file mode 100644
index 00000000..4e31a6ba
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
@@ -0,0 +1,393 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PredefinedInitialMeans;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality.KMeansQualityMeasure;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * X-means: Extending K-means with Efficient Estimation on the Number of
+ * Clusters.
+ *
+ * Note: this implementation does currently <em>not</em> use a k-d-tree for
+ * acceleration. Also note that k_max is not a hard threshold - the algorithm
+ * can return up to 2*k_max clusters!
+ *
+ * Reference:<br>
+ * <p>
+ * D. Pelleg, A. Moore:<br />
+ * X-means: Extending K-means with Efficient Estimation on the Number of
+ * Clusters<br />
+ * In: Proceedings of the 17th International Conference on Machine Learning
+ * (ICML 2000)
+ * </p>
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <M> Model type
+ */
+@Reference(authors = "D. Pelleg, A. Moore", //
+title = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
+booktitle = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", //
+url = "http://www.pelleg.org/shared/hp/download/xmeans.ps")
+public class XMeans<V extends NumberVector, M extends MeanModel> extends AbstractKMeans<V, M> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(XMeans.class);
+
+ /**
+ * Inner k-means algorithm.
+ */
+ private KMeans<V, M> innerKMeans;
+
+ /**
+ * Effective number of clusters, minimum and maximum.
+ */
+ private int k, k_min, k_max;
+
+ /**
+ * Initializer for k-means.
+ */
+ PredefinedInitialMeans splitInitializer;
+
+ /**
+ * Information criterion to choose the better split.
+ */
+ KMeansQualityMeasure<V> informationCriterion;
+
+ /**
+ * Random factory.
+ */
+ RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction Distance function
+ * @param k_min k_min parameter - minimum number of result clusters
+ * @param k_max k_max parameter - maximum number of result clusters
+ * @param maxiter Maximum number of iterations each.
+ * @param innerKMeans K-Means variant to use inside.
+ * @param informationCriterion The information criterion used for the
+ * splitting step
+ * @param random Random factory
+ */
+ public XMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k_min, int k_max, int maxiter, KMeans<V, M> innerKMeans, KMeansInitialization<? super V> initializer, PredefinedInitialMeans splitInitializer, KMeansQualityMeasure<V> informationCriterion, RandomFactory random) {
+ super(distanceFunction, k_min, maxiter, initializer);
+ this.k_min = k_min;
+ this.k_max = k_max;
+ this.k = k_min;
+ this.innerKMeans = innerKMeans;
+ this.splitInitializer = splitInitializer;
+ this.informationCriterion = informationCriterion;
+ this.rnd = random;
+ }
+
+ /**
+ * Run the algorithm on a database and relation.
+ *
+ * @param database Database to process
+ * @param relation Data relation
+ * @return Clustering result.
+ */
+ @Override
+ public Clustering<M> run(Database database, Relation<V> relation) {
+ MutableProgress prog = LOG.isVerbose() ? new MutableProgress("X-means number of clusters", k_max, LOG) : null;
+
+ // Run initial k-means to find at least k_min clusters
+ innerKMeans.setK(k_min);
+ splitInitializer.setInitialMeans(initializer.chooseInitialMeans(database, relation, k_min, getDistanceFunction(), Vector.FACTORY));
+ Clustering<M> clustering = innerKMeans.run(database, relation);
+
+ if(prog != null) {
+ prog.setProcessed(k_min, LOG);
+ }
+
+ ArrayList<Cluster<M>> clusters = new ArrayList<>(clustering.getAllClusters());
+ while(clusters.size() <= k_max) {
+ // Improve-Structure:
+ ArrayList<Cluster<M>> nextClusters = new ArrayList<>();
+ for(Cluster<M> cluster : clusters) {
+ // Try to split this cluster:
+ List<Cluster<M>> childClusterList = splitCluster(cluster, database, relation);
+ nextClusters.addAll(childClusterList);
+ if(childClusterList.size() > 1) {
+ k += childClusterList.size() - 1;
+ if(prog != null) {
+ if(k >= k_max) {
+ prog.setTotal(k + 1);
+ }
+ prog.setProcessed(k, LOG);
+ }
+ }
+ }
+ if(clusters.size() == nextClusters.size()) {
+ break;
+ }
+ // Improve-Params:
+ splitInitializer.setInitialClusters(nextClusters);
+ innerKMeans.setK(nextClusters.size());
+ clustering = innerKMeans.run(database, relation);
+ clusters.clear();
+ clusters.addAll(clustering.getAllClusters());
+ }
+
+ // Ensure that the progress bar finished.
+ if(prog != null) {
+ prog.setTotal(k);
+ prog.setProcessed(k, LOG);
+ }
+
+ if(LOG.isDebugging()) {
+ LOG.debug("X-means returned k=" + k + " clusters.");
+ }
+
+ // add all current clusters to the result
+ Clustering<M> result = new Clustering<>("X-Means Result", "X-Means", clusters);
+ return result;
+ }
+
+ /**
+ * Conditionally splits the clusters based on the information criterion.
+ *
+ * @param parentCluster Cluster to split
+ * @param database Database
+ * @param relation Data relation
+ * @return Parent cluster when split decreases clustering quality or child
+ * clusters when split improves clustering.
+ */
+ protected List<Cluster<M>> splitCluster(Cluster<M> parentCluster, Database database, Relation<V> relation) {
+ // Transform parent cluster into a clustering
+ ArrayList<Cluster<M>> parentClusterList = new ArrayList<Cluster<M>>(1);
+ parentClusterList.add(parentCluster);
+ Clustering<M> parentClustering = new Clustering<>(parentCluster.getName(), parentCluster.getName(), parentClusterList);
+
+ if(parentCluster.size() < 2) {
+ // Split is not possible
+ return parentClusterList;
+ }
+
+ ProxyDatabase proxyDB = new ProxyDatabase(parentCluster.getIDs(), database);
+
+ splitInitializer.setInitialMeans(splitCentroid(parentCluster, relation));
+ innerKMeans.setK(2);
+ Clustering<M> childClustering = innerKMeans.run(proxyDB);
+
+ double parentEvaluation = informationCriterion.quality(parentClustering, getDistanceFunction(), relation);
+ double childrenEvaluation = informationCriterion.quality(childClustering, getDistanceFunction(), relation);
+
+ if(LOG.isDebugging()) {
+ LOG.debug("parentEvaluation: " + parentEvaluation);
+ LOG.debug("childrenEvaluation: " + childrenEvaluation);
+ }
+
+ // Check if split is an improvement:
+ return (childrenEvaluation > parentEvaluation) ^ informationCriterion.ascending() ? parentClusterList : childClustering.getAllClusters();
+ }
+
+ /**
+ * Split an existing centroid into two initial centers.
+ *
+ * @param parentCluster Existing cluster
+ * @param relation Data relation
+ * @return List of new centroids
+ */
+ protected List<? extends NumberVector> splitCentroid(Cluster<? extends MeanModel> parentCluster, Relation<V> relation) {
+ Vector parentCentroid = parentCluster.getModel().getMean();
+
+ // Compute size of cluster/region
+ double radius = 0.;
+ for(DBIDIter it = parentCluster.getIDs().iter(); it.valid(); it.advance()) {
+ double d = getDistanceFunction().distance(relation.get(it), parentCentroid);
+ radius = (d > radius) ? d : radius;
+ }
+
+ // Choose random vector
+ Random random = rnd.getSingleThreadedRandom();
+ final int dim = RelationUtil.dimensionality(relation);
+ Vector randomVector = VectorUtil.randomVector(Vector.FACTORY, dim, random).normalize();
+ randomVector.timesEquals((.4 + random.nextDouble() * .5) * radius);
+
+ // Get the new centroids
+ ArrayList<Vector> vecs = new ArrayList<>(2);
+ vecs.add(parentCentroid.minus(randomVector));
+ vecs.add(randomVector.plusEquals(parentCentroid));
+ return vecs;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return innerKMeans.getInputTypeRestriction();
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ * @param <M> Model type of inner algorithm
+ */
+ public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractKMeans.Parameterizer<V> {
+ /**
+ * Parameter to specify the kMeans variant.
+ */
+ public static final OptionID INNER_KMEANS_ID = new OptionID("xmeans.kmeans", "kMeans algorithm to use.");
+
+ /**
+ * Minimum number of clusters.
+ */
+ public static final OptionID K_MIN_ID = new OptionID("xmeans.k_min", "The minimum number of clusters to find.");
+
+ /**
+ * Randomization seed.
+ */
+ public static final OptionID SEED_ID = new OptionID("xmeans.seed", "Random seed for splitting clusters.");
+
+ /**
+ * Quality measure to use for evaluating splits.
+ */
+ public static final OptionID INFORMATION_CRITERION_ID = new OptionID("xmeans.quality", "The quality measure to evaluate splits (e.g. AIC, BIC)");
+
+ /**
+ * Variant of kMeans
+ */
+ protected KMeans<V, M> innerKMeans;
+
+ /**
+ * Class to feed splits to the internal k-means algorithm.
+ */
+ protected PredefinedInitialMeans splitInitializer;
+
+ /**
+ * Information criterion.
+ */
+ protected KMeansQualityMeasure<V> informationCriterion;
+
+ /**
+ * Minimum and maximum number of result clusters.
+ */
+ protected int k_min, k_max;
+
+ /**
+ * Random number generator.
+ */
+ private RandomFactory random;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ // Do NOT invoke super.makeOptions to hide the "k" parameter.
+ IntParameter kMinP = new IntParameter(K_MIN_ID, 2) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kMinP)) {
+ k_min = kMinP.intValue();
+ }
+ IntParameter kMaxP = new IntParameter(KMeans.K_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kMaxP)) {
+ k_max = kMaxP.intValue();
+ }
+ // We allow k_min = k_max.
+ config.checkConstraint(new LessEqualGlobalConstraint<>(kMinP, kMaxP));
+ getParameterInitialization(config);
+ getParameterMaxIter(config);
+ getParameterDistanceFunction(config);
+
+ RandomParameter rndP = new RandomParameter(SEED_ID);
+ if(config.grab(rndP)) {
+ random = rndP.getValue();
+ }
+ splitInitializer = new PredefinedInitialMeans((List<Vector>) null);
+
+ ObjectParameter<KMeans<V, M>> innerKMeansP = new ObjectParameter<>(INNER_KMEANS_ID, KMeans.class, KMeansLloyd.class);
+ if(config.grab(innerKMeansP)) {
+ ListParameterization initialKMeansVariantParameters = new ListParameterization();
+ initialKMeansVariantParameters.addParameter(KMeans.K_ID, k_min);
+ initialKMeansVariantParameters.addParameter(KMeans.INIT_ID, splitInitializer);
+ initialKMeansVariantParameters.addParameter(KMeans.MAXITER_ID, maxiter);
+ initialKMeansVariantParameters.addParameter(KMeans.DISTANCE_FUNCTION_ID, distanceFunction);
+ ChainedParameterization combinedConfig = new ChainedParameterization(initialKMeansVariantParameters, config);
+ combinedConfig.errorsTo(config);
+ innerKMeans = innerKMeansP.instantiateClass(combinedConfig);
+ }
+
+ ObjectParameter<KMeansQualityMeasure<V>> informationCriterionP = new ObjectParameter<>(INFORMATION_CRITERION_ID, KMeansQualityMeasure.class);
+ if(config.grab(informationCriterionP)) {
+ informationCriterion = informationCriterionP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected XMeans<V, M> makeInstance() {
+ return new XMeans<V, M>(distanceFunction, k_min, k_max, maxiter, innerKMeans, initializer, splitInitializer, informationCriterion, random);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java
index 9e3eb478..43c59c65 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -22,7 +22,9 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -34,7 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @param <V> Vector type
*/
-public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> {
+public abstract class AbstractKMeansInitialization<V extends NumberVector> implements KMeansInitialization<V> {
/**
* Random number generator
*/
@@ -56,7 +58,7 @@ public abstract class AbstractKMeansInitialization<V> implements KMeansInitializ
*
* @apiviz.exclude
*/
- public abstract static class Parameterizer<V> extends AbstractParameterizer {
+ public abstract static class Parameterizer extends AbstractParameterizer {
/**
* Random generator
*/
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java
index 9edfd816..599c3975 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java
@@ -1,183 +1,199 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-import java.util.ArrayList;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
-import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
-import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
-
-/**
- * K-Means initialization by repeatedly choosing the farthest point.
- *
- * Note: this is less random than other initializations, so running multiple
- * times will be more likely to return the same local minima.
- *
- * @author Erich Schubert
- *
- * @param <V> Vector type
- * @param <D> Distance type
- */
-public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
- /**
- * Discard the first vector.
- */
- boolean dropfirst = true;
-
- /**
- * Constructor.
- *
- * @param rnd Random generator.
- * @param dropfirst Flag to discard the first vector.
- */
- public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) {
- super(rnd);
- this.dropfirst = dropfirst;
- }
-
- @Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
- // Get a distance query
- if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
- }
- @SuppressWarnings("unchecked")
- final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
- DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF);
-
- // Chose first mean
- List<V> means = new ArrayList<>(k);
-
- DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter();
- means.add(relation.get(first));
-
- DBIDVar best = DBIDUtil.newVar(first);
- for(int i = (dropfirst ? 0 : 1); i < k; i++) {
- // Find farthest object:
- double maxdist = Double.NEGATIVE_INFINITY;
- for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
- double dsum = 0.;
- for(V ex : means) {
- dsum += distQ.distance(ex, it).doubleValue();
- }
- if(dsum > maxdist) {
- maxdist = dsum;
- best.set(it);
- }
- }
- // Add new mean:
- if(k == 0) {
- means.clear(); // Remove temporary first element.
- }
- means.add(relation.get(best));
- }
-
- return means;
- }
-
- @Override
- public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
- if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
- }
- @SuppressWarnings("unchecked")
- DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
- final Relation<?> relation = distQ.getRelation();
- // Chose first mean
- ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
-
- DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter();
- means.add(first);
-
- DBIDVar best = DBIDUtil.newVar(first);
- for(int i = (dropfirst ? 0 : 1); i < k; i++) {
- // Find farthest object:
- double maxdist = Double.NEGATIVE_INFINITY;
- for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
- double dsum = 0.;
- for(DBIDIter ex = means.iter(); ex.valid(); ex.advance()) {
- dsum += distQ.distance(ex, it).doubleValue();
- }
- if(dsum > maxdist) {
- maxdist = dsum;
- best.set(it);
- }
- }
- // Add new mean:
- if(k == 0) {
- means.clear(); // Remove temporary first element.
- }
- means.add(best);
- }
-
- return means;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
- /**
- * Option ID to control the handling of the first object chosen.
- */
- public static final OptionID DROPFIRST_ID = new OptionID("farthest.dropfirst", "Drop the first object chosen (which is chosen randomly) for the farthest points heuristic.");
-
- /**
- * Flag for discarding the first object chosen.
- */
- protected boolean dropfirst = true;
-
- @Override
- protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
- Flag dropfirstP = new Flag(DROPFIRST_ID);
- if(config.grab(dropfirstP)) {
- dropfirst = dropfirstP.isTrue();
- }
- }
-
- @Override
- protected FarthestPointsInitialMeans<V, D> makeInstance() {
- return new FarthestPointsInitialMeans<>(rnd, dropfirst);
- }
- }
-}
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+
+/**
+ * K-Means initialization by repeatedly choosing the farthest point (by the
+ * <em>minimum</em> distance to earlier points).
+ *
+ * Note: this is less random than other initializations, so running multiple
+ * times will be more likely to return the same local minima.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for kMedoids and kMedians
+ */
+public class FarthestPointsInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
+ /**
+ * Discard the first vector.
+ */
+ boolean dropfirst = true;
+
+ /**
+ * Constructor.
+ *
+ * @param rnd Random generator.
+ * @param dropfirst Flag to discard the first vector.
+ */
+ public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) {
+ super(rnd);
+ this.dropfirst = dropfirst;
+ }
+
+ @Override
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ // Get a distance query
+ DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction);
+
+ DBIDs ids = relation.getDBIDs();
+ WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.POSITIVE_INFINITY);
+
+ // Chose first mean
+ List<V> means = new ArrayList<>(k);
+
+ DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+ T prevmean = relation.get(first);
+ means.add(factory.newNumberVector(prevmean));
+
+ // Find farthest object each.
+ DBIDVar best = DBIDUtil.newVar(first);
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+ double maxdist = Double.NEGATIVE_INFINITY;
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ final double prev = store.doubleValue(it);
+ if(prev != prev) {
+ continue; // NaN: already chosen!
+ }
+ double val = Math.min(prev, distQ.distance(prevmean, it));
+ // Don't store distance to first mean, when it will be dropped below.
+ if(i > 0) {
+ store.putDouble(it, val);
+ }
+ if(val > maxdist) {
+ maxdist = val;
+ best.set(it);
+ }
+ }
+ // Add new mean (and drop the initial mean when desired)
+ if(i == 0) {
+ means.clear(); // Remove temporary first element.
+ }
+ store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+ prevmean = relation.get(best);
+ means.add(factory.newNumberVector(prevmean));
+ }
+
+ // Explicitly destroy temporary data.
+ store.destroy();
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
+ @SuppressWarnings("unchecked")
+ final Relation<O> relation = (Relation<O>) distQ.getRelation();
+
+ WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.POSITIVE_INFINITY);
+
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+ DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+ means.add(first);
+ O prevmean = relation.get(first);
+
+ DBIDVar best = DBIDUtil.newVar(first);
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+ // Find farthest object:
+ double maxdist = Double.NEGATIVE_INFINITY;
+ for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ final double prev = store.doubleValue(it);
+ if(prev != prev) {
+ continue; // NaN: already chosen!
+ }
+ double val = Math.min(prev, distQ.distance(prevmean, it));
+ // Don't store distance to first mean, when it will be dropped below.
+ if(i > 0) {
+ store.putDouble(it, val);
+ }
+ if(val > maxdist) {
+ maxdist = val;
+ best.set(it);
+ }
+ }
+ // Add new mean:
+ if(i == 0) {
+ means.clear(); // Remove temporary first element.
+ }
+ store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+ prevmean = relation.get(best);
+ means.add(best);
+ }
+
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O> extends AbstractKMeansInitialization.Parameterizer {
+ /**
+ * Option ID to control the handling of the first object chosen.
+ */
+ public static final OptionID KEEPFIRST_ID = new OptionID("farthest.keepfirst", "Keep the first object chosen (which is chosen randomly) for the farthest points heuristic.");
+
+ /**
+ * Flag for discarding the first object chosen.
+ */
+ protected boolean keepfirst = false;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ Flag dropfirstP = new Flag(KEEPFIRST_ID);
+ if(config.grab(dropfirstP)) {
+ keepfirst = dropfirstP.isTrue();
+ }
+ }
+
+ @Override
+ protected FarthestPointsInitialMeans<O> makeInstance() {
+ return new FarthestPointsInitialMeans<>(rnd, !keepfirst);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java
new file mode 100644
index 00000000..c6b02e67
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java
@@ -0,0 +1,176 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+
+/**
+ * K-Means initialization by repeatedly choosing the farthest point (by the
+ * <em>sum</em> of distances to previous objects).
+ *
+ * Note: this is less random than other initializations, so running multiple
+ * times will be more likely to return the same local minima.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for kmedoids and kmedians
+ */
+public class FarthestSumPointsInitialMeans<O> extends FarthestPointsInitialMeans<O> {
+ /**
+ * Constructor.
+ *
+ * @param rnd Random generator.
+ * @param dropfirst Flag to discard the first vector.
+ */
+ public FarthestSumPointsInitialMeans(RandomFactory rnd, boolean dropfirst) {
+ super(rnd, dropfirst);
+ }
+
+ @Override
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ // Get a distance query
+ DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction);
+
+ DBIDs ids = relation.getDBIDs();
+ WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
+ // Chose first mean
+ List<V> means = new ArrayList<>(k);
+
+ DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+ T prevmean = relation.get(first);
+ means.add(factory.newNumberVector(prevmean));
+
+ // Find farthest object each.
+ DBIDVar best = DBIDUtil.newVar(first);
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+ double maxdist = Double.NEGATIVE_INFINITY;
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ final double prev = store.doubleValue(it);
+ if(prev != prev) {
+ continue; // NaN: already chosen!
+ }
+ double dsum = prev + distQ.distance(prevmean, it);
+ // Don't store distance to first mean, when it will be dropped below.
+ if(i > 0) {
+ store.putDouble(it, dsum);
+ }
+ if(dsum > maxdist) {
+ maxdist = dsum;
+ best.set(it);
+ }
+ }
+ // Add new mean (and drop the initial mean when desired)
+ if(i == 0) {
+ means.clear(); // Remove temporary first element.
+ }
+ store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+ prevmean = relation.get(best);
+ means.add(factory.newNumberVector(prevmean));
+ }
+
+ // Explicitly destroy temporary data.
+ store.destroy();
+ return means;
+ }
+
+  @Override
+  public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
+    @SuppressWarnings("unchecked")
+    final Relation<O> relation = (Relation<O>) distQ.getRelation();
+
+    WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
+    ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+    DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+    means.add(first);
+    O prevmean = relation.get(first);
+
+    DBIDVar best = DBIDUtil.newVar(first);
+    for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+      // Find farthest object:
+      double maxdist = Double.NEGATIVE_INFINITY;
+      for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+        final double prev = store.doubleValue(it);
+        if(prev != prev) {
+          continue; // NaN: already chosen!
+        }
+        double dsum = prev + distQ.distance(prevmean, it);
+        // Don't store distance to first mean, when it will be dropped below.
+        if(i > 0) {
+          store.putDouble(it, dsum);
+        }
+        if(dsum > maxdist) {
+          maxdist = dsum;
+          best.set(it);
+        }
+      }
+      // Add new mean (and drop the temporary first medoid on iteration 0):
+      if(i == 0) {
+        means.clear(); // Remove temporary first element.
+      }
+      store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+      prevmean = relation.get(best);
+      means.add(best);
+    }
+
+    return means;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<V> extends FarthestPointsInitialMeans.Parameterizer<V> {
+    /*
+     * The keepfirst flag is inherited from the parent parameterizer, which
+     * grabs it in makeOptions(); a redeclaration here would shadow it and
+     * silently ignore the -farthest.keepfirst option.
+     */
+
+    @Override
+    protected FarthestSumPointsInitialMeans<V> makeInstance() {
+      return new FarthestSumPointsInitialMeans<>(rnd, !keepfirst);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java
index 08e2f116..af5a7717 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java
@@ -1,87 +1,87 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-import java.util.ArrayList;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
-import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-
-/**
- * Initialize K-means by using the first k objects as initial means.
- *
- * @author Erich Schubert
- *
- * @param <V> Vector type
- */
-public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsInitialization<V> {
- /**
- * Constructor.
- */
- public FirstKInitialMeans() {
- super();
- }
-
- @Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
- DBIDIter iter = relation.iterDBIDs();
- List<V> means = new ArrayList<>(k);
- for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
- means.add(relation.get(iter));
- }
- return means;
- }
-
- @Override
- public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) {
- DBIDIter iter = distanceFunction.getRelation().iterDBIDs();
- ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
- for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
- means.add(iter);
- }
- return means;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
- @Override
- protected FirstKInitialMeans<V> makeInstance() {
- return new FirstKInitialMeans<>();
- }
- }
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Initialize K-means by using the first k objects as initial means.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for KMedoids
+ */
+public class FirstKInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> {
+ /**
+ * Constructor.
+ */
+ public FirstKInitialMeans() {
+ super();
+ }
+
+ @Override
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ DBIDIter iter = relation.iterDBIDs();
+ List<V> means = new ArrayList<>(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(factory.newNumberVector(relation.get(iter)));
+ }
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distanceFunction) {
+ DBIDIter iter = ids.iter();
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(iter);
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ @Override
+ protected FirstKInitialMeans<V> makeInstance() {
+ return new FirstKInitialMeans<>();
+ }
+ }
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java
index 06fb10c1..e87b9d14 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -36,9 +36,9 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
*
* @apiviz.landmark
*
- * @param <V> Object type
+ * @param <V> Vector type
*/
-public interface KMeansInitialization<V> {
+public interface KMeansInitialization<V extends NumberVector> {
/**
* Choose initial means
*
@@ -46,8 +46,10 @@ public interface KMeansInitialization<V> {
* @param relation Relation
* @param k Parameter k
* @param distanceFunction Distance function
- *
+ * @param factory Factory for output vectors.
+ * @param <T> Input vector type
+ * @param <O> Output vector type
* @return List of chosen means for k-means
*/
- public abstract List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction);
+ public abstract <T extends V, O extends NumberVector> List<O> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<O> factory);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java
index 6fc514eb..977a4182 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,21 +28,21 @@ import java.util.Random;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
-import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
/**
* K-Means++ initialization for k-means.
@@ -57,11 +57,10 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
*
* @author Erich Schubert
*
- * @param <V> Vector type
- * @param <D> Distance type
+ * @param <O> Vector type
*/
@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494")
-public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
+public class KMeansPlusPlusInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
/**
* Constructor.
*
@@ -72,25 +71,20 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
}
@Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
- // Get a distance query
- if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("K-Means++ initialization can only be used with numerical distances.");
- }
- @SuppressWarnings("unchecked")
- final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
- DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF);
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction);
+
+ DBIDs ids = relation.getDBIDs();
+ WritableDoubleDataStore weights = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
// Chose first mean
List<V> means = new ArrayList<>(k);
Random random = rnd.getSingleThreadedRandom();
- DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, random).iter());
- means.add(relation.get(first));
+ DBID first = DBIDUtil.deref(DBIDUtil.randomSample(ids, 1, random).iter());
+ means.add(factory.newNumberVector(relation.get(first)));
- ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
// Initialize weights
- double[] weights = new double[ids.size()];
double weightsum = initialWeights(weights, ids, first, distQ);
while(means.size() < k) {
if(weightsum > Double.MAX_VALUE) {
@@ -100,47 +94,43 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?");
}
double r = random.nextDouble() * weightsum;
- int pos = 0;
- while(r > 0 && pos < weights.length - 1) {
- r -= weights[pos];
- pos++;
+ DBIDIter it = ids.iter();
+ for(; r > 0. && it.valid(); it.advance()) {
+ double w = weights.doubleValue(it);
+ if(w != w) {
+          continue; // NaN: already chosen.
+ }
+ r -= w;
}
// Add new mean:
- DBID newmean = ids.get(pos);
- means.add(relation.get(newmean));
+ final T newmean = relation.get(it);
+ means.add(factory.newNumberVector(newmean));
// Update weights:
- weights[pos] = 0.0;
+ weights.putDouble(it, Double.NaN);
// Choose optimized version for double distances, if applicable.
- if(distF instanceof PrimitiveDoubleDistanceFunction) {
- @SuppressWarnings("unchecked")
- PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF;
- weightsum = updateWeights(weights, ids, newmean, ddist, relation);
- }
- else {
- weightsum = updateWeights(weights, ids, newmean, distQ);
- }
+ weightsum = updateWeights(weights, ids, newmean, distQ);
}
+ // Explicitly destroy temporary data.
+ weights.destroy();
+
return means;
}
@Override
- public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
- if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("K-Means++ initialization initialization can only be used with numerical distances.");
- }
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
@SuppressWarnings("unchecked")
- DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
- // Chose first mean
+ final Relation<O> rel = (Relation<O>) distQ.getRelation();
+
ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+ WritableDoubleDataStore weights = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
Random random = rnd.getSingleThreadedRandom();
- DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random).iter());
+ DBIDRef first = DBIDUtil.randomSample(ids, 1, random).iter();
means.add(first);
- ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs());
// Initialize weights
- double[] weights = new double[ids.size()];
double weightsum = initialWeights(weights, ids, first, distQ);
while(means.size() < k) {
if(weightsum > Double.MAX_VALUE) {
@@ -150,17 +140,19 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?");
}
double r = random.nextDouble() * weightsum;
- int pos = 0;
- while(r > 0 && pos < weights.length) {
- r -= weights[pos];
- pos++;
+ DBIDIter it = ids.iter();
+ for(; r > 0. && it.valid(); it.advance()) {
+ double w = weights.doubleValue(it);
+ if(w != w) {
+          continue; // NaN: already chosen.
+ }
+ r -= w;
}
// Add new mean:
- DBID newmean = ids.get(pos);
- means.add(newmean);
+ means.add(it);
// Update weights:
- weights[pos] = 0.0;
- weightsum = updateWeights(weights, ids, newmean, distQ);
+ weights.putDouble(it, Double.NaN);
+ weightsum = updateWeights(weights, ids, rel.get(it), distQ);
}
return means;
@@ -175,18 +167,13 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
* @param distQ Distance query
* @return Weight sum
*/
- protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) {
- double weightsum = 0.0;
- DBIDIter it = ids.iter();
- for(int i = 0; i < weights.length; i++, it.advance()) {
- if(DBIDUtil.equal(latest, it)) {
- weights[i] = 0.0;
- }
- else {
- double d = distQ.distance(latest, it).doubleValue();
- weights[i] = d * d;
- }
- weightsum += weights[i];
+ protected double initialWeights(WritableDoubleDataStore weights, DBIDs ids, DBIDRef latest, DistanceQuery<?> distQ) {
+ double weightsum = 0.;
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ // Distance will usually already be squared
+ double weight = distQ.distance(latest, it);
+ weights.putDouble(it, weight);
+ weightsum += weight;
}
return weightsum;
}
@@ -200,38 +187,19 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
* @param distQ Distance query
* @return Weight sum
*/
- protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) {
- double weightsum = 0.0;
- DBIDIter it = ids.iter();
- for(int i = 0; i < weights.length; i++, it.advance()) {
- if(weights[i] > 0.0) {
- double d = distQ.distance(latest, it).doubleValue();
- weights[i] = Math.min(weights[i], d * d);
- weightsum += weights[i];
+ protected <T> double updateWeights(WritableDoubleDataStore weights, DBIDs ids, T latest, DistanceQuery<? super T> distQ) {
+ double weightsum = 0.;
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ double weight = weights.doubleValue(it);
+ if(weight != weight) {
+ continue; // NaN: already chosen!
}
- }
- return weightsum;
- }
-
- /**
- * Update the weight list.
- *
- * @param weights Weight list
- * @param ids IDs
- * @param latest Added ID
- * @param distF Distance function
- * @return Weight sum
- */
- protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, PrimitiveDoubleDistanceFunction<V> distF, Relation<V> rel) {
- final V lv = rel.get(latest);
- double weightsum = 0.0;
- DBIDIter it = ids.iter();
- for(int i = 0; i < weights.length; i++, it.advance()) {
- if(weights[i] > 0.0) {
- double d = distF.doubleDistance(lv, rel.get(it));
- weights[i] = Math.min(weights[i], d * d);
- weightsum += weights[i];
+ double newweight = distQ.distance(latest, it);
+ if(newweight < weight) {
+ weights.putDouble(it, newweight);
+ weight = newweight;
}
+ weightsum += weight;
}
return weightsum;
}
@@ -243,9 +211,9 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer {
@Override
- protected KMeansPlusPlusInitialMeans<V, D> makeInstance() {
+ protected KMeansPlusPlusInitialMeans<V> makeInstance() {
return new KMeansPlusPlusInitialMeans<>(rnd);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java
index 136a4129..9ae59961 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java
@@ -1,9 +1,9 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -38,8 +38,9 @@ public interface KMedoidsInitialization<V> {
* Choose initial means
*
* @param k Parameter k
+ * @param ids Candidate IDs.
* @param distanceFunction Distance function
* @return List of chosen means for k-means
*/
- public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction);
+ public abstract DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super V> distanceFunction);
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java
index c7e1751f..812ea4ab 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -38,7 +38,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.math.Mean;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -58,11 +57,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*
- * @param <V> Vector type
- * @param <D> Distance type
+ * @param <O> Object type for KMedoids initialization
*/
-@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
-public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMeansInitialization<V>, KMedoidsInitialization<V> {
+@Reference(title = "Clustering by means of Medoids", //
+authors = "Kaufman, L. and Rousseeuw, P.J.", //
+booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
+public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> {
/**
* Constructor.
*/
@@ -71,31 +71,24 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean
}
@Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ // Ugly cast; but better than code duplication.
+ @SuppressWarnings("unchecked")
+ Relation<O> rel = (Relation<O>) relation;
// Get a distance query
- if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("PAM initialization can only be used with numerical distances.");
- }
@SuppressWarnings("unchecked")
- final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
- final DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF);
- DBIDs medids = chooseInitialMedoids(k, distQ);
+ final PrimitiveDistanceFunction<? super O> distF = (PrimitiveDistanceFunction<? super O>) distanceFunction;
+ final DistanceQuery<O> distQ = database.getDistanceQuery(rel, distF);
+ DBIDs medids = chooseInitialMedoids(k, rel.getDBIDs(), distQ);
List<V> medoids = new ArrayList<>(k);
for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) {
- medoids.add(relation.get(iter));
+ medoids.add(factory.newNumberVector(relation.get(iter)));
}
return medoids;
}
@Override
- public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
- if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("PAM initialization can only be used with numerical distances.");
- }
- @SuppressWarnings("unchecked")
- DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
- final DBIDs ids = distQ.getRelation().getDBIDs();
-
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
ArrayModifiableDBIDs medids = DBIDUtil.newArray(k);
double best = Double.POSITIVE_INFINITY;
Mean mean = new Mean(); // Mean is numerically more stable than sum.
@@ -109,7 +102,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean
WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
mean.reset();
for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
- double d = distQ.distance(iter, iter2).doubleValue();
+ double d = distQ.distance(iter, iter2);
mean.put(d);
newd.putDouble(iter2, d);
}
@@ -141,7 +134,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean
WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
mean.reset();
for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
- double dn = distQ.distance(iter, iter2).doubleValue();
+ double dn = distQ.distance(iter, iter2);
double v = Math.min(dn, mindist.doubleValue(iter2));
mean.put(v);
newd.put(iter2, v);
@@ -178,9 +171,9 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean
*
* @apiviz.exclude
*/
- public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ public static class Parameterizer<V> extends AbstractParameterizer {
@Override
- protected PAMInitialMeans<V, D> makeInstance() {
+ protected PAMInitialMeans<V> makeInstance() {
return new PAMInitialMeans<>();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java
new file mode 100644
index 00000000..01df56ae
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java
@@ -0,0 +1,161 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.NumberVector.Factory;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.VectorListParameter;
+
+/**
+ * Run k-means with prespecified initial means.
+ *
+ * @author Erich Schubert
+ */
+public class PredefinedInitialMeans extends AbstractKMeansInitialization<NumberVector> {
+ /**
+ * Initial means to return.
+ */
+ List<? extends NumberVector> initialMeans;
+
+ /**
+ * Constructor.
+ *
+ * @param initialMeans Initial means
+ */
+ public PredefinedInitialMeans(List<? extends NumberVector> initialMeans) {
+ super(null);
+ this.setInitialMeans(initialMeans);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param initialMeans Initial means
+ */
+ public PredefinedInitialMeans(double[][] initialMeans) {
+ super(null);
+ this.setInitialMeans(initialMeans);
+ }
+
+ /**
+ * Set the initial means.
+ *
+ * Important notice: Use with care - the means are <em>not copied</em>!
+ *
+ * @param initialMeans initial means.
+ */
+ public void setInitialMeans(List<? extends NumberVector> initialMeans) {
+ this.initialMeans = initialMeans;
+ }
+
+ /**
+ * Set the initial means.
+ *
+ * Important notice: Use with care - the means are <em>not copied</em>!
+ *
+ * @param initialMeans initial means.
+ */
+ public void setInitialClusters(List<? extends Cluster<? extends MeanModel>> initialMeans) {
+ List<Vector> vecs = new ArrayList<>(initialMeans.size());
+ for(Cluster<? extends MeanModel> cluster : initialMeans) {
+ vecs.add(cluster.getModel().getMean().copy());
+ }
+ this.initialMeans = vecs;
+ }
+
+ /**
+ * Set the initial means.
+ *
+ * Important notice: Use with care - the means are <em>not copied</em>!
+ *
+ * @param initialMeans initial means.
+ */
+ public void setInitialMeans(double[][] initialMeans) {
+ List<Vector> vecs = new ArrayList<>(initialMeans.length);
+ for(int i = 0; i < initialMeans.length; ++i) {
+ vecs.add(new Vector(initialMeans[i]));
+ }
+ this.initialMeans = vecs;
+ }
+
+ @Override
+ public <T extends NumberVector, O extends NumberVector> List<O> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, Factory<O> factory) {
+ if(k != initialMeans.size()) {
+ throw new AbortException("Predefined initial means contained " + initialMeans.size() + " means, algorithm requested " + k + " means instead.");
+ }
+ // Chose first mean
+ List<O> means = new ArrayList<>(k);
+
+ for(NumberVector v : initialMeans) {
+ means.add(factory.newNumberVector(v));
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Option to specify the initial means to use.
+ */
+ public static final OptionID INITIAL_MEANS = new OptionID("kmeans.means", "Initial means for k-means.");
+
+ /**
+ * Initial means.
+ */
+ protected List<Vector> initialMeans;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ VectorListParameter meansP = new VectorListParameter(INITIAL_MEANS);
+ if(config.grab(meansP)) {
+ initialMeans = meansP.getValue();
+ }
+ }
+
+ @Override
+ protected PredefinedInitialMeans makeInstance() {
+ return new PredefinedInitialMeans(initialMeans);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java
index 214f4ce6..fe6b7281 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java
@@ -1,84 +1,84 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-import java.util.ArrayList;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
-import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
-
-/**
- * Initialize K-means by randomly choosing k exsiting elements as cluster
- * centers.
- *
- * @author Erich Schubert
- *
- * @param <V> Vector type
- */
-public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
- /**
- * Constructor.
- *
- * @param rnd Random generator.
- */
- public RandomlyChosenInitialMeans(RandomFactory rnd) {
- super(rnd);
- }
-
- @Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
- DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd);
- List<V> means = new ArrayList<>(k);
- for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- means.add(relation.get(iter));
- }
- return means;
- }
-
- @Override
- public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) {
- return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, rnd);
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> {
- @Override
- protected RandomlyChosenInitialMeans<V> makeInstance() {
- return new RandomlyChosenInitialMeans<>(rnd);
- }
- }
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+
+/**
+ * Initialize K-means by randomly choosing k existing elements as cluster
+ * centers.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Vector type
+ */
+public class RandomlyChosenInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
+ /**
+ * Constructor.
+ *
+ * @param rnd Random generator.
+ */
+ public RandomlyChosenInitialMeans(RandomFactory rnd) {
+ super(rnd);
+ }
+
+ @Override
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd);
+ List<V> means = new ArrayList<>(k);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ means.add(factory.newNumberVector(relation.get(iter)));
+ }
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distanceFunction) {
+ return DBIDUtil.randomSample(ids, k, rnd);
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer {
+ @Override
+ protected RandomlyChosenInitialMeans<V> makeInstance() {
+ return new RandomlyChosenInitialMeans<>(rnd);
+ }
+ }
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java
index 1329132e..4f7a21a0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -32,19 +32,15 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.math.MathUtil;
-import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
-import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
/**
* Initialize k-means by generating random vectors (within the data sets value
* range).
*
* @author Erich Schubert
- *
- * @param <V> Vector type
*/
-public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends AbstractKMeansInitialization<V> {
+public class RandomlyGeneratedInitialMeans extends AbstractKMeansInitialization<NumberVector> {
/**
* Constructor.
*
@@ -55,17 +51,20 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab
}
@Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
final int dim = RelationUtil.dimensionality(relation);
- NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
+ double[][] minmax = RelationUtil.computeMinMax(relation);
+ double[] min = minmax[0], scale = minmax[1];
+ for(int d = 0; d < dim; d++) {
+ scale[d] = scale[d] - min[d];
+ }
List<V> means = new ArrayList<>(k);
final Random random = rnd.getSingleThreadedRandom();
for(int i = 0; i < k; i++) {
double[] r = MathUtil.randomDoubleArray(dim, random);
// Rescale
for(int d = 0; d < dim; d++) {
- r[d] = minmax.first.doubleValue(d) + (minmax.second.doubleValue(d) - minmax.first.doubleValue(d)) * r[d];
+ r[d] = min[d] + scale[d] * r[d];
}
means.add(factory.newNumberVector(r));
}
@@ -79,10 +78,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer extends AbstractKMeansInitialization.Parameterizer {
@Override
- protected RandomlyGeneratedInitialMeans<V> makeInstance() {
- return new RandomlyGeneratedInitialMeans<>(rnd);
+ protected RandomlyGeneratedInitialMeans makeInstance() {
+ return new RandomlyGeneratedInitialMeans(rnd);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java
index 79013364..bd1bc5a8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,10 +26,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.model.ModelUtil;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
@@ -38,8 +40,8 @@ import de.lmu.ifi.dbs.elki.database.relation.ProxyView;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
@@ -54,11 +56,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> Vector type
*/
-public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization<V> {
+public class SampleKMeansInitialization<V extends NumberVector> extends AbstractKMeansInitialization<V> {
/**
- * Variant of kMeans for the bisecting step.
+ * Variant of kMeans to use for initialization.
*/
- private KMeans<V, D, ?> innerkMeans;
+ private KMeans<V, ?> innerkMeans;
/**
* Sample size.
@@ -72,28 +74,36 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
* @param innerkMeans Inner k-means algorithm.
* @param rate Sampling rate.
*/
- public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, D, ?> innerkMeans, double rate) {
+ public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, ?> innerkMeans, double rate) {
super(rnd);
this.innerkMeans = innerkMeans;
this.rate = rate;
}
@Override
- public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
+ public <T extends V, O extends NumberVector> List<O> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<O> factory) {
final int samplesize = (int) Math.ceil(rate * relation.size());
final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), samplesize, rnd);
- ProxyView<V> proxyv = new ProxyView<>(database, sample, relation);
+ // Ugly cast, sorry
+ @SuppressWarnings("unchecked")
+ Relation<V> rel = (Relation<V>) relation;
+ // FIXME: This does not necessarily hold. Check and fail!
+ if(!distanceFunction.getInputTypeRestriction().isAssignableFromType(TypeUtil.NUMBER_VECTOR_FIELD)) {
+ LoggingUtil.warning("Initializing k-means with k-means using specialized distance functions MAY fail, if the initialization method does require a distance defined on arbitrary number vectors.");
+ }
+ @SuppressWarnings("unchecked")
+ PrimitiveDistanceFunction<? super NumberVector> pdf = (PrimitiveDistanceFunction<? super NumberVector>) distanceFunction;
+ ProxyView<V> proxyv = new ProxyView<>(database, sample, rel);
ProxyDatabase proxydb = new ProxyDatabase(sample, proxyv);
innerkMeans.setK(k);
- @SuppressWarnings("unchecked")
- PrimitiveDistanceFunction<? super NumberVector<?>, D> df = (PrimitiveDistanceFunction<? super NumberVector<?>, D>) distanceFunction;
- innerkMeans.setDistanceFunction(df);
- Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv);
- List<V> means = new ArrayList<>();
- for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) {
- means.add(cluster.getModel().getMean());
+ innerkMeans.setDistanceFunction(pdf);
+ Clustering<?> clusters = innerkMeans.run(proxydb, proxyv);
+
+ List<O> means = new ArrayList<>();
+ for(Cluster<?> cluster : clusters.getAllClusters()) {
+ means.add(factory.newNumberVector(ModelUtil.getPrototype(cluster.getModel(), relation)));
}
return means;
@@ -107,9 +117,8 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
* @apiviz.exclude
*
* @param <V> Vector type
- * @param <D> Distance type
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeansInitialization.Parameterizer {
/**
* Parameter to specify the kMeans variant.
*/
@@ -123,8 +132,8 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
/**
* Inner k-means algorithm to use.
*/
- protected KMeans<V, D, ?> innerkMeans;
-
+ protected KMeans<V, ?> innerkMeans;
+
/**
* Sampling rate.
*/
@@ -133,8 +142,8 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- ObjectParameter<KMeans<V, D, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
- if (config.grab(kMeansVariantP)) {
+ ObjectParameter<KMeans<V, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
+ if(config.grab(kMeansVariantP)) {
ListParameterization kMeansVariantParameters = new ListParameterization();
// We will always invoke this with k as requested from outside!
@@ -145,15 +154,15 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
combinedConfig.errorsTo(config);
innerkMeans = kMeansVariantP.instantiateClass(combinedConfig);
}
-
+
DoubleParameter sampleP = new DoubleParameter(SAMPLE_ID);
- if (config.grab(sampleP)) {
+ if(config.grab(sampleP)) {
rate = sampleP.doubleValue();
}
}
@Override
- protected SampleKMeansInitialization<V, D> makeInstance() {
+ protected SampleKMeansInitialization<V> makeInstance() {
return new SampleKMeansInitialization<>(rnd, innerkMeans, rate);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java
new file mode 100644
index 00000000..14af1d69
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Initialization strategies for k-means.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
index aa4c3e24..da9bd20e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java
new file mode 100644
index 00000000..38133c6b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java
@@ -0,0 +1,272 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.parallel.Executor;
+import de.lmu.ifi.dbs.elki.parallel.processor.Processor;
+
+/**
+ * Parallel k-means implementation.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has Instance
+ */
+public class KMeansProcessor<V extends NumberVector> implements Processor {
+ /**
+ * Data relation.
+ */
+ Relation<V> relation;
+
+ /**
+ * Distance function.
+ */
+ PrimitiveDistanceFunction<? super NumberVector> distance;
+
+ /**
+ * Assignment storage.
+ */
+ WritableIntegerDataStore assignment;
+
+ /**
+ * Mean vectors.
+ */
+ List<Vector> means;
+
+ /**
+ * Updated cluster centroids
+ */
+ double[][] centroids;
+
+ /**
+ * (Partial) cluster sizes
+ */
+ int[] sizes;
+
+ /**
+ * Variance sum.
+ */
+ double[] varsum;
+
+ /**
+ * Whether the assignment changed during the last iteration.
+ */
+ boolean changed = false;
+
+ /**
+ * Constructor.
+ *
+ * @param relation Data relation
+ * @param distance Distance function
+ * @param assignment Cluster assignment
+ * @param varsum Variance sums
+ */
+ public KMeansProcessor(Relation<V> relation, PrimitiveDistanceFunction<? super NumberVector> distance, WritableIntegerDataStore assignment, double[] varsum) {
+ super();
+ this.distance = distance;
+ this.relation = relation;
+ this.assignment = assignment;
+ this.varsum = varsum;
+ }
+
+ /**
+ * Get the "has changed" value.
+ *
+ * @return Changed flag.
+ */
+ public boolean changed() {
+ return changed;
+ }
+
+ /**
+ * Initialize for a new iteration.
+ *
+ * @param means New means.
+ */
+ public void nextIteration(List<Vector> means) {
+ this.means = means;
+ changed = false;
+ final int k = means.size();
+ final int dim = means.get(0).getDimensionality();
+ centroids = new double[k][dim];
+ sizes = new int[k];
+ Arrays.fill(varsum, 0.);
+ }
+
+ @Override
+  public Instance instantiate(Executor executor) {
+ return new Instance(relation, distance, assignment, means);
+ }
+
+ @Override
+ public void cleanup(Processor.Instance inst) {
+ @SuppressWarnings("unchecked")
+ Instance instance = (Instance) inst;
+ synchronized(this) {
+ changed |= instance.changed;
+ for(int i = 0; i < centroids.length; i++) {
+ int sizeb = instance.sizes[i];
+ if(sizeb == 0) {
+ continue;
+ }
+ int sizea = sizes[i];
+ double sum = sizea + sizeb;
+ double[] cent = centroids[i];
+ if(sizea > 0) {
+ VMath.timesEquals(cent, sizea / sum);
+ }
+ VMath.plusTimesEquals(cent, instance.centroids[i], 1. / sum);
+        sizes[i] += sizeb;
+      }
+      VMath.plusEquals(varsum, instance.varsum);
+ }
+ }
+
+ /**
+ * Get the new means.
+ *
+ * @return New means
+ */
+ public List<Vector> getMeans() {
+ ArrayList<Vector> newmeans = new ArrayList<>(centroids.length);
+ for(int i = 0; i < centroids.length; i++) {
+ if(sizes[i] == 0) {
+ newmeans.add(means.get(i)); // Keep old mean.
+ continue;
+ }
+ newmeans.add(new Vector(centroids[i]));
+ }
+ return newmeans;
+ }
+
+ /**
+ * Instance to process part of the data set, for a single iteration.
+ *
+ * @author Erich Schubert
+ */
+ public class Instance implements Processor.Instance {
+ /**
+ * Data relation.
+ */
+ private Relation<V> relation;
+
+ /**
+ * Distance function.
+ */
+ private PrimitiveDistanceFunction<? super NumberVector> distance;
+
+ /**
+ * Cluster assignment storage.
+ */
+ private WritableIntegerDataStore assignment;
+
+ /**
+ * Current mean vectors.
+ */
+ private Vector[] means;
+
+ /**
+ * Updated cluster centroids
+ */
+ private double[][] centroids;
+
+ /**
+ * (Partial) cluster sizes
+ */
+ private int[] sizes;
+
+ /**
+ * Variance sum.
+ */
+ private double[] varsum;
+
+ /**
+ * Changed flag.
+ */
+ private boolean changed = false;
+
+ /**
+ * Constructor.
+ *
+ * @param relation Data relation
+ * @param distance Distance function
+ * @param assignment Current assignment
+ * @param means Previous mean vectors
+ */
+ public Instance(Relation<V> relation, PrimitiveDistanceFunction<? super NumberVector> distance, WritableIntegerDataStore assignment, List<? extends NumberVector> means) {
+ super();
+ this.relation = relation;
+ this.distance = distance;
+ this.assignment = assignment;
+ final int k = means.size();
+ this.means = new Vector[k];
+ Iterator<? extends NumberVector> iter = means.iterator();
+ for(int i = 0; i < k; i++) {
+ this.means[i] = iter.next().getColumnVector(); // Make local copy!
+ }
+ // Storage for updated means.
+ final int dim = this.means[0].getDimensionality();
+ this.centroids = new double[k][dim];
+ this.sizes = new int[k];
+ this.varsum = new double[k];
+ }
+
+ @Override
+ public void map(DBIDRef id) {
+ final V fv = relation.get(id);
+ // Find minimum:
+ double mindist = Double.POSITIVE_INFINITY;
+ int minIndex = 0;
+ for(int i = 0; i < means.length; i++) {
+ final double dist = distance.distance(fv, means[i]);
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ varsum[minIndex] += mindist;
+ // Update assignment:
+ int prev = assignment.putInt(id, minIndex);
+ // Update changed flag:
+ changed |= (prev != minIndex);
+ double[] cent = centroids[minIndex];
+ for(int d = 0; d < fv.getDimensionality(); d++) {
+ // TODO: improve numerical stability via Kahan summation?
+ cent[d] += fv.doubleValue(d);
+ }
+ ++sizes[minIndex];
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java
new file mode 100644
index 00000000..35262616
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/ParallelLloydKMeans.java
@@ -0,0 +1,154 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.AbstractKMeans;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.parallel.ParallelExecutor;
+
+/**
+ * Parallel implementation of k-Means clustering.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has KMeansProcessor
+ *
+ * @param <V> Vector type
+ */
+public class ParallelLloydKMeans<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction Distance function
+ * @param k K parameter
+ */
+ public ParallelLloydKMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Class logger
+ */
+ private static final Logging LOG = Logging.getLogger(ParallelLloydKMeans.class);
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
+ DBIDs ids = relation.getDBIDs();
+
+ // Choose initial means
+ List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
+
+ // Store for current cluster assignment.
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] varsum = new double[k];
+ KMeansProcessor<V> kmm = new KMeansProcessor<>(relation, distanceFunction, assignment, varsum);
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ kmm.nextIteration(means);
+ ParallelExecutor.run(ids, kmm);
+ // Stop if no cluster assignment changed.
+ if (!kmm.changed()) {
+ break;
+ }
+ means = kmm.getMeans();
+ }
+ LOG.setCompleted(prog);
+
+ // Wrap result
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for (int i = 0; i < k; i++) {
+ // TODO: use cluster sizes from kmm!
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ clusters.get(assignment.intValue(iter)).add(iter);
+ }
+
+ Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for (int i = 0; i < clusters.size(); i++) {
+ DBIDs cids = clusters.get(i);
+ if (cids.size() == 0) {
+ continue;
+ }
+ KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
+ result.addToplevelCluster(new Cluster<>(cids, model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected ParallelLloydKMeans<V> makeInstance() {
+ return new ParallelLloydKMeans<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/package-info.java
new file mode 100644
index 00000000..89941cc3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Parallelized implementations of k-means.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java
new file mode 100644
index 00000000..88279023
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java
@@ -0,0 +1,239 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Base class for evaluating clusterings by information criteria (such as AIC or
+ * BIC). Provides helper functions (e.g. max likelihood calculation) to its
+ * subclasses.
+ *
+ * The use of information-theoretic criteria for evaluating k-means was
+ * popularized by X-means:
+ * <p>
+ * D. Pelleg, A. Moore:<br />
+ * X-means: Extending K-means with Efficient Estimation on the Number of
+ * Clusters<br />
+ * In: Proceedings of the 17th International Conference on Machine Learning
+ * (ICML 2000)
+ * </p>
+ *
+ * A different version of logLikelihood is derived in:
+ * <p>
+ * Q. Zhao, M. Xu, P. Fränti:<br />
+ * Knee Point Detection on Bayesian Information Criterion<br />
+ * 20th IEEE International Conference on Tools with Artificial Intelligence
+ * </p>
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ */
+@Reference(authors = "D. Pelleg, A. Moore", //
+title = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
+booktitle = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", //
+url = "http://www.pelleg.org/shared/hp/download/xmeans.ps")
+public abstract class AbstractKMeansQualityMeasure<O extends NumberVector> implements KMeansQualityMeasure<O> {
+ /**
+ * Compute the number of points in a given set of clusters (which may be
+ * <i>less</i> than the complete data set for X-means!)
+ *
+ * @param clustering Clustering to analyze
+ * @return Number of points
+ */
+ public static int numPoints(Clustering<? extends MeanModel> clustering) {
+ int n = 0;
+ for(Cluster<? extends MeanModel> aCluster : clustering.getAllClusters()) {
+ n += aCluster.size();
+ }
+ return n;
+ }
+
+ /**
+ * Variance contribution of a single cluster.
+ *
+ * If possible, this information is reused from the clustering process (when a
+ * KMeansModel is returned).
+ *
+ * @param cluster Cluster to access
+ * @param distanceFunction Distance function
+ * @param relation Data relation
+ * @return Cluster variance
+ */
+ public static double varianceOfCluster(Cluster<? extends MeanModel> cluster, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<? extends NumberVector> relation) {
+ MeanModel model = cluster.getModel();
+ if(model instanceof KMeansModel) {
+ return ((KMeansModel) model).getVarianceContribution();
+ }
+ // Re-compute:
+ DBIDs ids = cluster.getIDs();
+ Vector mean = model.getMean();
+
+ boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction);
+ double variance = 0.;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double dist = distanceFunction.distance(relation.get(iter), mean);
+ variance += squared ? dist : dist * dist;
+ }
+ return variance;
+ }
+
+ /**
+ * Computes log likelihood of an entire clustering.
+ *
+ * Version as used in the X-means publication.
+ *
+ * @param relation Data relation
+ * @param clustering Clustering
+ * @param distanceFunction Distance function
+ * @return Log Likelihood.
+ */
+  @Reference(authors = "D. Pelleg, A. Moore", //
+  title = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
+  booktitle = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", //
+  url = "http://www.pelleg.org/shared/hp/download/xmeans.ps")
+ public static double logLikelihood(Relation<? extends NumberVector> relation, Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
+ List<? extends Cluster<? extends MeanModel>> clusters = clustering.getAllClusters();
+ // dimensionality of data points
+ final int dim = RelationUtil.dimensionality(relation);
+ // number of clusters
+ final int m = clusters.size();
+
+ // number of objects in the clustering
+ int n = 0;
+ // cluster sizes
+ int[] n_i = new int[m];
+ // total variance
+ double d = 0.;
+ // variances
+ double[] d_i = new double[m];
+
+ // Iterate over clusters:
+ Iterator<? extends Cluster<? extends MeanModel>> it = clusters.iterator();
+ for(int i = 0; it.hasNext(); ++i) {
+ Cluster<? extends MeanModel> cluster = it.next();
+ n += n_i[i] = cluster.size();
+ d += d_i[i] = varianceOfCluster(cluster, distanceFunction, relation);
+ }
+
+ // Total variance (corrected for bias)
+ final double v = d / (n - m), logv = Math.log(v);
+ // log likelihood of this clustering
+ double logLikelihood = 0.;
+
+ // Aggregate
+ for(int i = 0; i < m; i++) {
+ logLikelihood += n_i[i] * Math.log(n_i[i]) // Posterior entropy, Rn log Rn
+ - n_i[i] * .5 * MathUtil.LOGTWOPI // Rn/2 log2pi
+ - n_i[i] * dim * .5 * logv // Rn M/2 log sigma^2
+      - (n_i[i] - m) * .5; // (Rn-K)/2
+ }
+ logLikelihood -= n * Math.log(n); // Prior entropy, sum_i Rn log R
+
+ return logLikelihood;
+ }
+
+ /**
+ * Computes log likelihood of an entire clustering.
+ *
+ * Version as used by Zhao et al.
+ *
+ * @param relation Data relation
+ * @param clustering Clustering
+ * @param distanceFunction Distance function
+ * @return Log Likelihood.
+ */
+ @Reference(authors = "Q. Zhao, M. Xu, P. Fränti", //
+ title = "Knee Point Detection on Bayesian Information Criterion", //
+ booktitle = "20th IEEE International Conference on Tools with Artificial Intelligence", //
+ url = "http://dx.doi.org/10.1109/ICTAI.2008.154")
+ public static double logLikelihoodAlternate(Relation<? extends NumberVector> relation, Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
+ List<? extends Cluster<? extends MeanModel>> clusters = clustering.getAllClusters();
+ // dimensionality of data points
+ final int dim = RelationUtil.dimensionality(relation);
+ // number of clusters
+ final int m = clusters.size();
+
+ // number of objects in the clustering
+ int n = 0;
+ // cluster sizes
+ int[] n_i = new int[m];
+ // variances
+ double[] d_i = new double[m];
+
+ // Iterate over clusters:
+ Iterator<? extends Cluster<? extends MeanModel>> it = clusters.iterator();
+ for(int i = 0; it.hasNext(); ++i) {
+ Cluster<? extends MeanModel> cluster = it.next();
+ n += n_i[i] = cluster.size();
+ d_i[i] = varianceOfCluster(cluster, distanceFunction, relation);
+ }
+
+ // log likelihood of this clustering
+ double logLikelihood = 0.;
+
+ // Aggregate
+ for(int i = 0; i < m; i++) {
+ logLikelihood += n_i[i] * Math.log(n_i[i] / (double) n) // ni log ni/n
+ - n_i[i] * dim * .5 * MathUtil.LOGTWOPI // ni*d/2 log2pi
+ - n_i[i] * .5 * Math.log(d_i[i]) // ni/2 log sigma_i
+ - (n_i[i] - m) * .5; // (ni-m)/2
+ }
+ return logLikelihood;
+ }
+
+ /**
+ * Compute the number of free parameters.
+ *
+ * @param relation Data relation (for dimensionality)
+ * @param clustering Set of clusters
+ * @return Number of free parameters
+ */
+ public static int numberOfFreeParameters(Relation<? extends NumberVector> relation, Clustering<? extends MeanModel> clustering) {
+ // number of clusters
+ int m = clustering.getAllClusters().size(); // num_ctrs
+
+ // dimensionality of data points
+ int dim = RelationUtil.dimensionality(relation); // num_dims
+
+ // number of free parameters
+ return (m - 1) + m * dim + m;
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java
new file mode 100644
index 00000000..f1619386
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AkaikeInformationCriterion.java
@@ -0,0 +1,74 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Akaike Information Criterion (AIC).
+ *
+ * Reference:
+ * <p>
+ * H. Akaike<br />
+ * On entropy maximization principle<br />
+ * Application of statistics, 1977, North-Holland
+ * </p>
+ *
+ * The use for k-means was popularized by:
+ * <p>
+ * D. Pelleg, A. Moore:<br />
+ * X-means: Extending K-means with Efficient Estimation of the Number of
+ * Clusters<br />
+ * In: Proceedings of the 17th International Conference on Machine Learning
+ * (ICML 2000)
+ * </p>
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ */
+@Reference(authors = "H. Akaike", //
+title = "On entropy maximization principle", //
+booktitle = "Application of statistics, 1977, North-Holland")
+public class AkaikeInformationCriterion extends AbstractKMeansQualityMeasure<NumberVector> {
+ @Override
+ public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) {
+ return logLikelihood(relation, clustering, distanceFunction) - numberOfFreeParameters(relation, clustering);
+ }
+
+ @Override
+ public boolean ascending() {
+ return true;
+ }
+
+ @Override
+ public boolean isBetter(double currentCost, double bestCost) {
+ // Careful: bestCost may be NaN!
+ return !(currentCost <= bestCost);
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java
new file mode 100644
index 00000000..73b0c501
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java
@@ -0,0 +1,77 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Bayesian Information Criterion (BIC), also known as Schwarz criterion (SBC,
+ * SBIC) for the use with evaluating k-means results.
+ *
+ * Reference:
+ * <p>
+ * G. Schwarz<br />
+ * Estimating the dimension of a model<br />
+ * The annals of statistics 6.2.
+ * </p>
+ *
+ * The use for k-means was popularized by:
+ * <p>
+ * D. Pelleg, A. Moore:<br />
+ * X-means: Extending K-means with Efficient Estimation of the Number of
+ * Clusters<br />
+ * In: Proceedings of the 17th International Conference on Machine Learning
+ * (ICML 2000)
+ * </p>
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ */
+@Reference(authors = "G. Schwarz", //
+title = "Estimating the dimension of a model", //
+booktitle = "The annals of statistics 6.2", //
+url = "http://dx.doi.org/10.1214/aos/1176344136")
+public class BayesianInformationCriterion extends AbstractKMeansQualityMeasure<NumberVector> {
+ @Override
+ public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) {
+ return logLikelihood(relation, clustering, distanceFunction) //
+ - (.5 * numberOfFreeParameters(relation, clustering)) * Math.log(numPoints(clustering));
+ }
+
+ @Override
+ public boolean ascending() {
+ return true;
+ }
+
+ @Override
+ public boolean isBetter(double currentCost, double bestCost) {
+ // Careful: bestCost may be NaN!
+ return !(currentCost <= bestCost);
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java
new file mode 100644
index 00000000..3fa646c2
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java
@@ -0,0 +1,67 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Different version of the BIC criterion.
+ *
+ * Reference:
+ * <p>
+ * Q. Zhao, M. Xu, P. Fränti:<br />
+ * Knee Point Detection on Bayesian Information Criterion<br />
+ * 20th IEEE International Conference on Tools with Artificial Intelligence
+ * </p>
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ */
+@Reference(authors = "Q. Zhao, M. Xu, P. Fränti", //
+title = "Knee Point Detection on Bayesian Information Criterion", //
+booktitle = "20th IEEE International Conference on Tools with Artificial Intelligence", //
+url = "http://dx.doi.org/10.1109/ICTAI.2008.154")
+public class BayesianInformationCriterionZhao extends AbstractKMeansQualityMeasure<NumberVector> {
+ @Override
+ public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) {
+ return logLikelihoodAlternate(relation, clustering, distanceFunction) //
+ - (.5 * clustering.getAllClusters().size()) * Math.log(numPoints(clustering));
+ }
+
+ @Override
+ public boolean ascending() {
+ return true;
+ }
+
+ @Override
+ public boolean isBetter(double currentCost, double bestCost) {
+ // Careful: bestCost may be NaN!
+ return !(currentCost <= bestCost);
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
index f2de7846..2a234261 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,17 +28,17 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
/**
* Interface for computing the quality of a K-Means clustering.
*
+ * Important note: some measures are ascending, others are descending!
+ *
* @author Erich Schubert
*
* @param <O> Input Object restriction type
- * @param <D> Distance restriction type
*/
-public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Distance<?>> {
+public interface KMeansQualityMeasure<O extends NumberVector> {
/**
* Calculates and returns the quality measure.
*
@@ -50,5 +50,22 @@ public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Dista
*
* @return quality measure
*/
- <V extends O> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends D> distanceFunction, Relation<V> relation);
+ <V extends O> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation);
+
+ /**
+ * Use ascending or descending ordering.
+ *
+ * @return {@code true} when larger scores are better.
+ */
+ boolean ascending();
+
+ /**
+ * Compare two scores.
+ *
+ * @param currentCost New (candidate) cost/score
+ * @param bestCost Existing best cost/score (may be {@code NaN})
+ * @return {@code true} when the new score is better, or the old score is
+ * {@code NaN}.
+ */
+ boolean isBetter(double currentCost, double bestCost);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
index e0ddfff0..9083f924 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
/*
This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -22,8 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.List;
-
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -32,8 +30,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
/**
* Class for computing the average overall distance.
@@ -42,48 +38,35 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
*
* @author Stephan Baier
*/
-public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> {
+public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector> {
@Override
- public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) {
- @SuppressWarnings("unchecked")
- final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters();
-
- if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) {
- @SuppressWarnings("unchecked")
- PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction;
- double clusterDistanceSum = 0;
- for (Cluster<MeanModel<V>> cluster : clusterList) {
- DBIDs ids = cluster.getIDs();
+ public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) {
+ double clusterDistanceSum = 0;
+ for(Cluster<? extends MeanModel> cluster : clustering.getAllClusters()) {
+ DBIDs ids = cluster.getIDs();
- // Compute sum of pairwise distances:
- double clusterPairwiseDistanceSum = 0;
- for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) {
- V obj1 = relation.get(iter1);
- for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
- clusterPairwiseDistanceSum += df.doubleDistance(obj1, relation.get(iter2));
- }
+ // Compute sum of pairwise distances:
+ double clusterPairwiseDistanceSum = 0;
+ for(DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) {
+ V obj1 = relation.get(iter1);
+ for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2));
}
- clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size());
}
+ clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size());
+ }
- return clusterDistanceSum / clusterList.size();
- } else {
- double clusterDistanceSum = 0;
- for (Cluster<MeanModel<V>> cluster : clusterList) {
- DBIDs ids = cluster.getIDs();
+ return clusterDistanceSum / clustering.getAllClusters().size();
+ }
- // Compute sum of pairwise distances:
- double clusterPairwiseDistanceSum = 0;
- for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) {
- V obj1 = relation.get(iter1);
- for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
- clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2)).doubleValue();
- }
- }
- clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size());
- }
+ @Override
+ public boolean ascending() {
+ return false;
+ }
- return clusterDistanceSum / clusterList.size();
- }
+ @Override
+ public boolean isBetter(double currentCost, double bestCost) {
+ // Careful: bestCost may be NaN!
+ return !(currentCost >= bestCost);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
index 32ad5210..513e3ca3 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
/*
This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -22,62 +22,37 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.List;
-
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.MeanModel;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
/**
* Class for computing the variance in a clustering result (sum-of-squares).
*
* @author Stephan Baier
+ * @author Erich Schubert
*/
-public class WithinClusterVarianceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> {
+public class WithinClusterVarianceQualityMeasure extends AbstractKMeansQualityMeasure<NumberVector> {
@Override
- public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) {
- @SuppressWarnings("unchecked")
- final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters();
-
- boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction);
- if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) {
- @SuppressWarnings("unchecked")
- PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction;
- double variance = 0.0;
- for (Cluster<MeanModel<V>> cluster : clusterList) {
- DBIDs ids = cluster.getIDs();
- V mean = cluster.getModel().getMean();
+ public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) {
+ double variance = 0.;
+ for(Cluster<? extends MeanModel> cluster : clustering.getAllClusters()) {
+ variance += varianceOfCluster(cluster, distanceFunction, relation);
+ }
+ return variance;
+ }
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- double dist = df.doubleDistance(relation.get(iter), mean);
- if (squared) {
- variance += dist;
- } else {
- variance += dist * dist;
- }
- }
- }
- return variance;
- } else {
- double variance = 0.0;
- for (Cluster<MeanModel<V>> cluster : clusterList) {
- DBIDs ids = cluster.getIDs();
- V mean = cluster.getModel().getMean();
+ @Override
+ public boolean ascending() {
+ return false;
+ }
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- double dist = distanceFunction.distance(relation.get(iter), mean).doubleValue();
- variance += dist * dist;
- }
- }
- return variance;
- }
+ @Override
+ public boolean isBetter(double currentCost, double bestCost) {
+ // Careful: bestCost may be NaN!
+ return !(currentCost >= bestCost);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
index 1be19bd1..767048db 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
@@ -6,7 +6,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team