summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java182
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java219
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java186
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java46
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java231
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java86
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java88
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java16
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java76
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java11
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java160
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java54
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java89
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java83
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java4
24 files changed, 1330 insertions, 293 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index 47855aad..dc1fa47c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -43,9 +43,17 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Abstract base class for k-means implementations.
@@ -59,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
* @param <D> Distance type
* @param <M> Cluster model type
*/
-public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans, ClusteringAlgorithm<Clustering<M>> {
+public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans<V, D, M>, ClusteringAlgorithm<Clustering<M>> {
/**
* Holds the value of {@link #K_ID}.
*/
@@ -102,54 +110,53 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) {
boolean changed = false;
- if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
V fv = relation.get(iditer);
int minIndex = 0;
- for(int i = 0; i < k; i++) {
+ for (int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
- if(dist < mindist) {
+ if (dist < mindist) {
minIndex = i;
mindist = dist;
}
}
- if(clusters.get(minIndex).add(iditer)) {
+ if (clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for(int i = 0; i < k; i++) {
- if(i != minIndex) {
- if(clusters.get(i).remove(iditer)) {
+ for (int i = 0; i < k; i++) {
+ if (i != minIndex) {
+ if (clusters.get(i).remove(iditer)) {
break;
}
}
}
}
}
- }
- else {
+ } else {
final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
V fv = relation.get(iditer);
int minIndex = 0;
- for(int i = 0; i < k; i++) {
+ for (int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
- if(dist.compareTo(mindist) < 0) {
+ if (dist.compareTo(mindist) < 0) {
minIndex = i;
mindist = dist;
}
}
- if(clusters.get(minIndex).add(iditer)) {
+ if (clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for(int i = 0; i < k; i++) {
- if(i != minIndex) {
- if(clusters.get(i).remove(iditer)) {
+ for (int i = 0; i < k; i++) {
+ if (i != minIndex) {
+ if (clusters.get(i).remove(iditer)) {
break;
}
}
@@ -174,21 +181,24 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @return the mean vectors of the given clusters in the given database
*/
protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) {
- List<Vector> newMeans = new ArrayList<Vector>(k);
- for(int i = 0; i < k; i++) {
+ List<Vector> newMeans = new ArrayList<>(k);
+ for (int i = 0; i < k; i++) {
ModifiableDBIDs list = clusters.get(i);
Vector mean = null;
- if(list.size() > 0) {
+ if (list.size() > 0) {
double s = 1.0 / list.size();
DBIDIter iter = list.iter();
assert (iter.valid());
mean = database.get(iter).getColumnVector().timesEquals(s);
+ double[] raw = mean.getArrayRef();
iter.advance();
- for(; iter.valid(); iter.advance()) {
- mean.plusTimesEquals(database.get(iter).getColumnVector(), s);
+ for (; iter.valid(); iter.advance()) {
+ NumberVector<?> vec = database.get(iter);
+ for (int j = 0; j < mean.getDimensionality(); j++) {
+ raw[j] += s * vec.doubleValue(j);
+ }
}
- }
- else {
+ } else {
mean = means.get(i).getColumnVector();
}
newMeans.add(mean);
@@ -207,19 +217,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
protected List<NumberVector<?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> medians, Relation<V> database) {
final int dim = medians.get(0).getDimensionality();
final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database);
- List<NumberVector<?>> newMedians = new ArrayList<NumberVector<?>>(k);
- for(int i = 0; i < k; i++) {
+ List<NumberVector<?>> newMedians = new ArrayList<>(k);
+ for (int i = 0; i < k; i++) {
ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i));
- if(list.size() > 0) {
+ if (list.size() > 0) {
Vector mean = new Vector(dim);
- for(int d = 0; d < dim; d++) {
+ for (int d = 0; d < dim; d++) {
sorter.setDimension(d);
DBID id = QuickSelect.median(list, sorter);
mean.set(d, database.get(id).doubleValue(d));
}
newMedians.add(mean);
- }
- else {
+ } else {
newMedians.add((NumberVector<?>) medians.get(i));
}
}
@@ -235,7 +244,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param op Cluster size change / Weight change
*/
protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) {
- if(newsize == 0) {
+ if (newsize == 0) {
return; // Keep old mean
}
Vector delta = vec.getColumnVector();
@@ -256,65 +265,62 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
boolean changed = false;
- if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
// Raw distance function
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
// Incremental update
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
V fv = relation.get(iditer);
int minIndex = 0;
- for(int i = 0; i < k; i++) {
+ for (int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
- if(dist < mindist) {
+ if (dist < mindist) {
minIndex = i;
mindist = dist;
}
}
// Update the cluster mean incrementally:
- for(int i = 0; i < k; i++) {
+ for (int i = 0; i < k; i++) {
ModifiableDBIDs ci = clusters.get(i);
- if(i == minIndex) {
- if(ci.add(iditer)) {
+ if (i == minIndex) {
+ if (ci.add(iditer)) {
incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
changed = true;
}
- }
- else if(ci.remove(iditer)) {
+ } else if (ci.remove(iditer)) {
incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
changed = true;
}
}
}
- }
- else {
+ } else {
// Raw distance function
final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
// Incremental update
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
V fv = relation.get(iditer);
int minIndex = 0;
- for(int i = 0; i < k; i++) {
+ for (int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
- if(dist.compareTo(mindist) < 0) {
+ if (dist.compareTo(mindist) < 0) {
minIndex = i;
mindist = dist;
}
}
// Update the cluster mean incrementally:
- for(int i = 0; i < k; i++) {
+ for (int i = 0; i < k; i++) {
ModifiableDBIDs ci = clusters.get(i);
- if(i == minIndex) {
- if(ci.add(iditer)) {
+ if (i == minIndex) {
+ if (ci.add(iditer)) {
incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
changed = true;
}
- }
- else if(ci.remove(iditer)) {
+ } else if (ci.remove(iditer)) {
incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
changed = true;
}
@@ -323,4 +329,76 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
}
return changed;
}
+
+ @Override
+ public void setK(int k) {
+ this.k = k;
+ }
+
+ @Override
+ public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) {
+ this.distanceFunction = distanceFunction;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public abstract static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> {
+ /**
+ * k Parameter.
+ */
+ protected int k;
+
+ /**
+ * Maximum number of iterations.
+ */
+ protected int maxiter;
+
+ /**
+ * Initialization method.
+ */
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
+ if (config.grab(distanceFunctionP)) {
+ distanceFunction = distanceFunctionP.instantiateClass(config);
+ if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
+ getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!");
+ }
+ }
+
+ IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(new GreaterConstraint(0));
+ if (config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class);
+ if (config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
+ maxiterP.addConstraint(new GreaterEqualConstraint(0));
+ if (config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ /**
+ * Get class logger.
+ *
+ * @return Logger
+ */
+ abstract protected Logging getLogger();
+
+ @Override
+ abstract protected AbstractKMeans<V, D, ?> makeInstance();
+ }
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
index 3a69c806..9e3eb478 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
new file mode 100644
index 00000000..30bb640c
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
@@ -0,0 +1,219 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality.KMeansQualityMeasure;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Run K-Means multiple times, and keep the best run.
+ *
+ * @author Stephan Baier
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ * @param <M> Model type
+ */
+public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(BestOfMultipleKMeans.class);
+
+ /**
+ * Number of trials to do.
+ */
+ private int trials;
+
+ /**
+ * Variant of kMeans for the bisecting step.
+ */
+ private KMeans<V, D, M> innerkMeans;
+
+ /**
+ * Quality measure which should be used.
+ */
+ private KMeansQualityMeasure<? super V, ? super D> qualityMeasure;
+
+ /**
+ * Constructor.
+ *
+ * @param trials Number of trials to do.
+ * @param innerkMeans K-Means variant to actually use.
+ * @param qualityMeasure Quality measure
+ */
+ public BestOfMultipleKMeans(int trials, KMeans<V, D, M> innerkMeans, KMeansQualityMeasure<? super V, ? super D> qualityMeasure) {
+ super();
+ this.trials = trials;
+ this.innerkMeans = innerkMeans;
+ this.qualityMeasure = qualityMeasure;
+ }
+
+ @Override
+ public Clustering<M> run(Database database, Relation<V> relation) {
+ if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) {
+ throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass());
+ }
+ final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction();
+ Clustering<M> bestResult = null;
+ if (trials > 1) {
+ double bestCost = Double.POSITIVE_INFINITY;
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
+ for (int i = 0; i < trials; i++) {
+ Clustering<M> currentCandidate = innerkMeans.run(database, relation);
+ double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation);
+
+ if (LOG.isVerbose()) {
+ LOG.verbose("Cost of candidate " + i + ": " + currentCost);
+ }
+
+ if (currentCost < bestCost) {
+ bestResult = currentCandidate;
+ bestCost = currentCost;
+ }
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ }
+ if (prog != null) {
+ prog.ensureCompleted(LOG);
+ }
+ } else {
+ bestResult = innerkMeans.run(database);
+ }
+
+ return bestResult;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return innerkMeans.getInputTypeRestriction();
+ }
+
+ @Override
+ public DistanceFunction<? super V, D> getDistanceFunction() {
+ return innerkMeans.getDistanceFunction();
+ }
+
+ @Override
+ public void setK(int k) {
+ innerkMeans.setK(k);
+ }
+
+ @Override
+ public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) {
+ innerkMeans.setDistanceFunction(distanceFunction);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Stephan Baier
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ * @param <M> Model type
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the iterations of the bisecting step.
+ */
+ public static final OptionID TRIALS_ID = new OptionID("kmeans.trials", "The number of trials to run.");
+
+ /**
+ * Parameter to specify the kMeans variant.
+ */
+ public static final OptionID KMEANS_ID = new OptionID("kmeans.algorithm", "KMeans variant to run multiple times.");
+
+ /**
+ * Parameter to specify the variant of quality measure.
+ */
+ public static final OptionID QUALITYMEASURE_ID = new OptionID("kmeans.qualitymeasure", "Quality measure variant for deciding which run to keep.");
+
+ /**
+ * Number of trials to perform.
+ */
+ protected int trials;
+
+ /**
+ * Variant of kMeans to use.
+ */
+ protected KMeans<V, D, M> kMeansVariant;
+
+ /**
+ * Quality measure.
+ */
+ protected KMeansQualityMeasure<? super V, ? super D> qualityMeasure;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ IntParameter trialsP = new IntParameter(TRIALS_ID);
+ trialsP.addConstraint(new GreaterEqualConstraint(1));
+ if (config.grab(trialsP)) {
+ trials = trialsP.intValue();
+ }
+
+ ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
+ if (config.grab(kMeansVariantP)) {
+ kMeansVariant = kMeansVariantP.instantiateClass(config);
+ }
+
+ ObjectParameter<KMeansQualityMeasure<V, ? super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class);
+ if (config.grab(qualityMeasureP)) {
+ qualityMeasure = qualityMeasureP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected BestOfMultipleKMeans<V, D, M> makeInstance() {
+ return new BestOfMultipleKMeans<>(trials, kMeansVariant, qualityMeasure);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
new file mode 100644
index 00000000..a018c04b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
@@ -0,0 +1,186 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+
+/**
+ * K-Means initialization by repeatedly choosing the farthest point.
+ *
+ * Note: this is less random than other initializations, so running multiple
+ * times will be more likely to return the same local minima.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
+ /**
+ * Discard the first vector.
+ */
+ boolean dropfirst = true;
+
+ /**
+ * Constructor.
+ *
+ * @param rnd Random generator.
+ * @param dropfirst Flag to discard the first vector.
+ */
+ public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) {
+ super(rnd);
+ this.dropfirst = dropfirst;
+ }
+
+ @Override
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
+ // Get a distance query
+ if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
+ DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF);
+
+ // Chose first mean
+ List<V> means = new ArrayList<>(k);
+
+ Random random = rnd.getRandom();
+ DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter();
+ means.add(relation.get(first));
+
+ DBIDVar best = DBIDUtil.newVar(first);
+ for (int i = (dropfirst ? 0 : 1); i < k; i++) {
+ // Find farthest object:
+ double maxdist = Double.NEGATIVE_INFINITY;
+ for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ double dsum = 0.;
+ for (V ex : means) {
+ dsum += distQ.distance(ex, it).doubleValue();
+ }
+ if (dsum > maxdist) {
+ maxdist = dsum;
+ best.set(it);
+ }
+ }
+ // Add new mean:
+ if (k == 0) {
+ means.clear(); // Remove temporary first element.
+ }
+ means.add(relation.get(best));
+ }
+
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
+ if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
+ final Relation<?> relation = distQ.getRelation();
+ // Chose first mean
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+ Random random = rnd.getRandom();
+ DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter();
+ means.add(first);
+
+ DBIDVar best = DBIDUtil.newVar(first);
+ for (int i = (dropfirst ? 0 : 1); i < k; i++) {
+ // Find farthest object:
+ double maxdist = Double.NEGATIVE_INFINITY;
+ for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ double dsum = 0.;
+ for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) {
+ dsum += distQ.distance(ex, it).doubleValue();
+ }
+ if (dsum > maxdist) {
+ maxdist = dsum;
+ best.set(it);
+ }
+ }
+ // Add new mean:
+ if (k == 0) {
+ means.clear(); // Remove temporary first element.
+ }
+ means.add(best);
+ }
+
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ /**
+ * Option ID to control the handling of the first object chosen.
+ */
+ public static final OptionID DROPFIRST_ID = new OptionID("farthest.dropfirst", "Drop the first object chosen (which is chosen randomly) for the farthest points heuristic.");
+
+ /**
+ * Flag for discarding the first object chosen.
+ */
+ protected boolean dropfirst = true;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ Flag dropfirstP = new Flag(DROPFIRST_ID);
+ if (config.grab(dropfirstP)) {
+ dropfirst = dropfirstP.isTrue();
+ }
+ }
+
+ @Override
+ protected FarthestPointsInitialMeans<V, D> makeInstance() {
+ return new FarthestPointsInitialMeans<>(rnd, dropfirst);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
index 1e51f4d6..08e2f116 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
@@ -51,9 +52,9 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI
}
@Override
- public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
DBIDIter iter = relation.iterDBIDs();
- List<V> means = new ArrayList<V>(k);
+ List<V> means = new ArrayList<>(k);
for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
means.add(relation.get(iter));
}
@@ -80,7 +81,7 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI
public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
@Override
protected FirstKInitialMeans<V> makeInstance() {
- return new FirstKInitialMeans<V>();
+ return new FirstKInitialMeans<>();
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
index 68fc4e48..29c0a5c8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
@@ -1,12 +1,10 @@
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,12 +23,27 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+
/**
* Some constants and options shared among kmeans family algorithms.
*
* @author Erich Schubert
+ *
+ * @param <V> Number vector type
+ * @param <D> Distance type
+ * @param <M> Actual model type
*/
-public interface KMeans {
+public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V, D> {
/**
* Parameter to specify the initialization method
*/
@@ -52,4 +65,27 @@ public interface KMeans {
* Parameter to specify the random generator seed.
*/
public static final OptionID SEED_ID = new OptionID("kmeans.seed", "The random number generator seed.");
-} \ No newline at end of file
+
+ /**
+ * Run the clustering algorithm.
+ *
+ * @param database Database to run on.
+ * @param rel Relation to process.
+ * @return Clustering result
+ */
+ Clustering<M> run(Database database, Relation<V> rel);
+
+ /**
+ * Set the value of k. Needed for some types of nested k-means.
+ *
+ * @param k K parameter
+ */
+ void setK(int k);
+
+ /**
+ * Set the distance function to use.
+ *
+ * @param distanceFunction Distance function.
+ */
+ void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction);
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
new file mode 100644
index 00000000..37071d36
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
@@ -0,0 +1,231 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.LinkedList;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * The bisecting k-means algorithm works by starting with an initial
+ * partitioning into two clusters, then repeated splitting of the largest
+ * cluster to get additional clusters.
+ *
+ * Reference:<br>
+ * <p>
+ * M. Steinbach, G. Karypis, V. Kumar:<br />
+ * A Comparison of Document Clustering Techniques<br />
+ * KDD workshop on text mining. Vol. 400. No. 1
+ * </p>
+ *
+ * @author Stephan Baier
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ * @param <M> Model type
+ */
+@Reference(authors = "M. Steinbach, G. Karypis, V. Kumar", title = "A Comparison of Document Clustering Techniques", booktitle = "KDD workshop on text mining. Vol. 400. No. 1")
+public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansBisecting.class);
+
+ /**
+ * Variant of kMeans for the bisecting step.
+ */
+ private KMeans<V, D, M> innerkMeans;
+
+ /**
+ * Desired value of k.
+ */
+ private int k;
+
+ /**
+ * Constructor.
+ *
+ * @param k k parameter - number of result clusters
+ * @param innerkMeans KMeans variant parameter - for bisecting step
+ */
+ public KMeansBisecting(int k, KMeans<V, D, M> innerkMeans) {
+ super();
+ this.k = k;
+ this.innerkMeans = innerkMeans;
+ }
+
+ @Override
+ public Clustering<M> run(Database database, Relation<V> relation) {
+ ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
+
+ // Linked list is preferrable for scratch, as we will A) not need that many
+ // clusters and B) be doing random removals of the largest cluster (often at
+ // the head)
+ LinkedList<Cluster<M>> currentClusterList = new LinkedList<>();
+
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null;
+
+ for (int j = 0; j < this.k - 1; j++) {
+ // Choose a cluster to split and project database to cluster
+ if (currentClusterList.size() == 0) {
+ proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
+ } else {
+ Cluster<M> largestCluster = null;
+ for (Cluster<M> cluster : currentClusterList) {
+ if (largestCluster == null || cluster.size() > largestCluster.size()) {
+ largestCluster = cluster;
+ }
+ }
+ currentClusterList.remove(largestCluster);
+ proxyDB.setDBIDs(largestCluster.getIDs());
+ }
+
+ // Run the inner k-means algorithm:
+ // FIXME: ensure we run on the correct relation in a multirelational
+ // setting!
+ Clustering<M> innerResult = innerkMeans.run(proxyDB);
+ // Add resulting clusters to current result.
+ currentClusterList.addAll(innerResult.getAllClusters());
+
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ if (LOG.isVerbose()) {
+ LOG.verbose("Iteration " + j);
+ }
+ }
+ if (prog != null) {
+ prog.ensureCompleted(LOG);
+ }
+
+ // add all current clusters to the result
+ Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
+ for (Cluster<M> cluster : currentClusterList) {
+ result.addToplevelCluster(cluster);
+ }
+ return result;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return innerkMeans.getInputTypeRestriction();
+ }
+
+ @Override
+ public DistanceFunction<? super V, D> getDistanceFunction() {
+ return innerkMeans.getDistanceFunction();
+ }
+
+ @Override
+ public void setK(int k) {
+ this.k = k;
+ }
+
+ @Override
+ public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) {
+ innerkMeans.setDistanceFunction(distanceFunction);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Stephan Baier
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ * @param <M> Model type
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the kMeans variant.
+ */
+ public static final OptionID KMEANS_ID = new OptionID("bisecting.kmeansvariant", "KMeans variant");
+
+ /**
+ * Variant of kMeans
+ */
+ protected KMeans<V, D, M> kMeansVariant;
+
+ /**
+ * Desired number of clusters.
+ */
+ protected int k;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ IntParameter kP = new IntParameter(KMeans.K_ID);
+ kP.addConstraint(new GreaterConstraint(1));
+ if (config.grab(kP)) {
+ k = kP.intValue();
+ }
+
+ ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class);
+ if (config.grab(kMeansVariantP)) {
+ ListParameterization kMeansVariantParameters = new ListParameterization();
+
+ // We will always invoke this with k=2!
+ kMeansVariantParameters.addParameter(KMeans.K_ID, 2);
+
+ ChainedParameterization combinedConfig = new ChainedParameterization(kMeansVariantParameters, config);
+ combinedConfig.errorsTo(config);
+ kMeansVariant = kMeansVariantP.instantiateClass(combinedConfig);
+ }
+ }
+
+ @Override
+ protected KMeansBisecting<V, D, M> makeInstance() {
+ return new KMeansBisecting<>(k, kMeansVariant);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
index 54b3a2ce..06fb10c1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,6 +24,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
*/
import java.util.List;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
@@ -31,7 +33,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
* Interface for initializing K-Means
*
* @author Erich Schubert
- *
+ *
* @apiviz.landmark
*
* @param <V> Object type
@@ -40,10 +42,12 @@ public interface KMeansInitialization<V> {
/**
* Choose initial means
*
+ * @param database Database context
* @param relation Relation
* @param k Parameter k
- * @param distanceFunction Distance function
+ * @param distanceFunction Distance function
+ *
* @return List of chosen means for k-means
*/
- public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
+ public abstract List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index f43c2277..e692293c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -36,19 +35,13 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Provides the k-means algorithm, using Lloyd-style bulk iterations.
@@ -90,28 +83,23 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
super(distanceFunction, k, maxiter, initializer);
}
- /**
- * Run k-means.
- *
- * @param database Database
- * @param relation relation to use
- * @return result
- */
+ @Override
public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
if (relation.size() <= 0) {
- return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering");
+ return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
// Setup cluster assignment store
- List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (LOG.isVerbose()) {
- LOG.verbose("K-Means iteration " + (iteration + 1));
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
}
boolean changed = assignToNearestCluster(relation, means, clusters);
// Stop if no cluster assignment changed.
@@ -121,12 +109,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
// Recompute means.
means = means(clusters, means, relation);
}
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
+
// Wrap result
final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering");
+ Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
for (int i = 0; i < clusters.size(); i++) {
- KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
- result.addCluster(new Cluster<KMeansModel<V>>(clusters.get(i), model));
+ KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
return result;
}
@@ -143,53 +135,15 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> {
- /**
- * k Parameter.
- */
- protected int k;
-
- /**
- * Number of iterations.
- */
- protected int maxiter;
-
- /**
- * Initialization method.
- */
- protected KMeansInitialization<V> initializer;
-
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
@Override
- protected void makeOptions(Parameterization config) {
- ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
- if(config.grab(distanceFunctionP)) {
- distanceFunction = distanceFunctionP.instantiateClass(config);
- if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
- LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!");
- }
- }
-
- IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
- k = kP.getValue();
- }
-
- ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
- if (config.grab(initialP)) {
- initializer = initialP.instantiateClass(config);
- }
-
- IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
- maxiter = maxiterP.intValue();
- }
+ protected Logging getLogger() {
+ return LOG;
}
@Override
protected KMeansLloyd<V, D> makeInstance() {
- return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer);
+ return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index 0cc7c363..bb689bd3 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -37,20 +36,14 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Provides the k-means algorithm, using MacQueen style incremental updates.
@@ -89,24 +82,18 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
super(distanceFunction, k, maxiter, initializer);
}
- /**
- * Run k-means.
- *
- * @param database Database
- * @param relation relation to use
- * @return Clustering result
- */
+ @Override
public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
if (relation.size() <= 0) {
- return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering");
+ return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = new ArrayList<Vector>(k);
- for (NumberVector<?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) {
+ List<Vector> means = new ArrayList<>(k);
+ for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) {
means.add(nv.getColumnVector());
}
// Initialize cluster and assign objects
- List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
@@ -114,22 +101,27 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
// Initial recomputation of the means.
means = means(clusters, means, relation);
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
// Refine result
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (LOG.isVerbose()) {
- LOG.verbose("K-Means iteration " + (iteration + 1));
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
}
boolean changed = macQueenIterate(relation, means, clusters);
if (!changed) {
break;
}
}
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
+
final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering");
+ Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
for (int i = 0; i < clusters.size(); i++) {
DBIDs ids = clusters.get(i);
- KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
- result.addCluster(new Cluster<KMeansModel<V>>(ids, model));
+ KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(ids, model));
}
return result;
}
@@ -146,53 +138,15 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> {
- /**
- * k Parameter.
- */
- protected int k;
-
- /**
- * Maximum number of iterations.
- */
- protected int maxiter;
-
- /**
- * Initialization method.
- */
- protected KMeansInitialization<V> initializer;
-
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
@Override
- protected void makeOptions(Parameterization config) {
- ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
- if (config.grab(distanceFunctionP)) {
- distanceFunction = distanceFunctionP.instantiateClass(config);
- if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
- LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!");
- }
- }
-
- IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
- k = kP.getValue();
- }
-
- ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
- if (config.grab(initialP)) {
- initializer = initialP.instantiateClass(config);
- }
-
- IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
- maxiter = maxiterP.getValue();
- }
+ protected Logging getLogger() {
+ return LOG;
}
@Override
protected KMeansMacQueen<V, D> makeInstance() {
- return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer);
+ return new KMeansMacQueen<>(distanceFunction, k, maxiter, initializer);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
index a07953da..302ca86b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +26,8 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Random;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
@@ -70,17 +72,17 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
}
@Override
- public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
// Get a distance query
if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("K-Means++ initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
- DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
+ DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF);
// Chose first mean
- List<V> means = new ArrayList<V>(k);
+ List<V> means = new ArrayList<>(k);
Random random = rnd.getRandom();
DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter());
@@ -99,7 +101,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
}
double r = random.nextDouble() * weightsum;
int pos = 0;
- while(r > 0 && pos < weights.length) {
+ while(r > 0 && pos < weights.length - 1) {
r -= weights[pos];
pos++;
}
@@ -125,7 +127,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
@Override
public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
- throw new AbortException("PAM initialization can only be used with numerical distances.");
+ throw new AbortException("K-Means++ initialization initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
@@ -244,7 +246,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
@Override
protected KMeansPlusPlusInitialMeans<V, D> makeInstance() {
- return new KMeansPlusPlusInitialMeans<V, D>(rnd);
+ return new KMeansPlusPlusInitialMeans<>(rnd);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
index 9917337e..cc7aaa9e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -39,13 +38,9 @@ import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Provides the k-medians clustering algorithm, using Lloyd-style bulk
@@ -83,28 +78,23 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext
super(distanceFunction, k, maxiter, initializer);
}
- /**
- * Run k-medians.
- *
- * @param database Database
- * @param relation relation to use
- * @return result
- */
+ @Override
public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
if (relation.size() <= 0) {
- return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering");
+ return new Clustering<>("k-Medians Clustering", "kmedians-clustering");
}
// Choose initial medians
- List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
// Setup cluster assignment store
- List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
- if (LOG.isVerbose()) {
- LOG.verbose("K-Medians iteration " + (iteration + 1));
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
}
boolean changed = assignToNearestCluster(relation, medians, clusters);
// Stop if no cluster assignment changed.
@@ -114,12 +104,15 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext
// Recompute medians.
medians = medians(clusters, medians, relation);
}
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
// Wrap result
final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering");
+ Clustering<MeanModel<V>> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering");
for (int i = 0; i < clusters.size(); i++) {
- MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef()));
- result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
+ MeanModel<V> model = new MeanModel<>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
return result;
}
@@ -136,46 +129,15 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> {
- /**
- * k Parameter.
- */
- protected int k;
-
- /**
- * Maximum number of iterations.
- */
- protected int maxiter;
-
- /**
- * Initialization method.
- */
- protected KMeansInitialization<V> initializer;
-
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
@Override
- protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
- IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
- k = kP.intValue();
- }
-
- ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
- if (config.grab(initialP)) {
- initializer = initialP.instantiateClass(config);
- }
-
- IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
- maxiter = maxiterP.intValue();
- }
+ protected Logging getLogger() {
+ return LOG;
}
@Override
protected KMediansLloyd<V, D> makeInstance() {
- return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer);
+ return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
index f4398458..87a0c7ae 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -46,6 +46,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.Mean;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
@@ -119,13 +120,13 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
*/
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
if (relation.size() <= 0) {
- return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
// Choose initial medoids
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
// Setup cluster assignment store
- List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
@@ -135,9 +136,13 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
// TODO: reuse this information, from the build phase, when possible?
assignToNearestCluster(medoids, mdists, clusters, distQ);
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null;
// Swap phase
boolean changed = true;
while (changed) {
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
changed = false;
// Try to swap the medoid with a better cluster member:
int i = 0;
@@ -168,12 +173,15 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
assignToNearestCluster(medoids, mdists, clusters, distQ);
}
}
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
// Wrap result
- Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
for (int i = 0; i < clusters.size(); i++) {
MedoidModel model = new MedoidModel(medoids.get(i));
- result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
return result;
}
@@ -256,7 +264,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
k = kP.intValue();
}
- ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
if (config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
@@ -270,7 +278,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
@Override
protected KMedoidsEM<V, D> makeInstance() {
- return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer);
+ return new KMedoidsEM<>(distanceFunction, k, maxiter, initializer);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
index 269e7e9e..136a4129 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
index 906501e4..1feda867 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -50,6 +50,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
@@ -124,14 +125,14 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
*/
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
if (relation.size() <= 0) {
- return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
DBIDs ids = relation.getDBIDs();
// Choose initial medoids
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
// Setup cluster assignment store
- List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
@@ -141,9 +142,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
// TODO: reuse this information, from the build phase, when possible?
assignToNearestCluster(medoids, ids, second, clusters, distQ);
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
// Swap phase
boolean changed = true;
while (changed) {
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
changed = false;
// Try to swap the medoid with a better cluster member:
double best = 0;
@@ -189,6 +194,9 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
}
}
}
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
if (LOG.isDebugging()) {
LOG.debug("Best cost: " + best);
}
@@ -204,10 +212,10 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
}
// Wrap result
- Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
for (int i = 0; i < clusters.size(); i++) {
MedoidModel model = new MedoidModel(medoids.get(i));
- result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
return result;
}
@@ -293,7 +301,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
k = kP.intValue();
}
- ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
if (config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
@@ -307,7 +315,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
@Override
protected KMedoidsPAM<V, D> makeInstance() {
- return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer);
+ return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
index 1fc7160e..c7e1751f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,6 +25,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
@@ -69,16 +71,16 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean
}
@Override
- public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
// Get a distance query
if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("PAM initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
- final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
+ final DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF);
DBIDs medids = chooseInitialMedoids(k, distQ);
- List<V> medoids = new ArrayList<V>(k);
+ List<V> medoids = new ArrayList<>(k);
for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) {
medoids.add(relation.get(iter));
}
@@ -179,7 +181,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean
public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
@Override
protected PAMInitialMeans<V, D> makeInstance() {
- return new PAMInitialMeans<V, D>();
+ return new PAMInitialMeans<>();
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
index 78e59be7..214f4ce6 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,6 +25,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
@@ -52,9 +54,9 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<
}
@Override
- public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd);
- List<V> means = new ArrayList<V>(k);
+ List<V> means = new ArrayList<>(k);
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
means.add(relation.get(iter));
}
@@ -74,10 +76,9 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<
* @apiviz.exclude
*/
public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> {
-
@Override
protected RandomlyChosenInitialMeans<V> makeInstance() {
- return new RandomlyChosenInitialMeans<V>(rnd);
+ return new RandomlyChosenInitialMeans<>(rnd);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
index 300f5cb0..ee90e0dc 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,6 +27,7 @@ import java.util.List;
import java.util.Random;
import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
@@ -54,11 +55,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab
}
@Override
- public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
final int dim = RelationUtil.dimensionality(relation);
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
- List<V> means = new ArrayList<V>(k);
+ List<V> means = new ArrayList<>(k);
final Random random = rnd.getRandom();
for(int i = 0; i < k; i++) {
double[] r = MathUtil.randomDoubleArray(dim, random);
@@ -81,7 +82,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab
public static class Parameterizer<V extends NumberVector<?>> extends AbstractKMeansInitialization.Parameterizer<V> {
@Override
protected RandomlyGeneratedInitialMeans<V> makeInstance() {
- return new RandomlyGeneratedInitialMeans<V>(rnd);
+ return new RandomlyGeneratedInitialMeans<>(rnd);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
new file mode 100644
index 00000000..9f0a1923
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
@@ -0,0 +1,160 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.ProxyView;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Initialize k-means by running k-means on a sample of the data set only.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Variant of kMeans for the bisecting step.
+ */
+ private KMeans<V, D, ?> innerkMeans;
+
+ /**
+ * Sample size.
+ */
+ private double rate;
+
+ /**
+ * Constructor.
+ *
+ * @param rnd Random generator.
+ * @param innerkMeans Inner k-means algorithm.
+ * @param rate Sampling rate.
+ */
+ public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, D, ?> innerkMeans, double rate) {
+ super(rnd);
+ this.innerkMeans = innerkMeans;
+ this.rate = rate;
+ }
+
+ @Override
+ public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
+ final int samplesize = (int) Math.ceil(rate * relation.size());
+ final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), samplesize, rnd);
+
+ ProxyView<V> proxyv = new ProxyView<>(database, sample, relation);
+ ProxyDatabase proxydb = new ProxyDatabase(sample, proxyv);
+
+ innerkMeans.setK(k);
+ @SuppressWarnings("unchecked")
+ PrimitiveDistanceFunction<? super NumberVector<?>, D> df = (PrimitiveDistanceFunction<? super NumberVector<?>, D>) distanceFunction;
+ innerkMeans.setDistanceFunction(df);
+ Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv);
+ List<V> means = new ArrayList<>();
+ for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) {
+ means.add((V) cluster.getModel().getMean());
+ }
+
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ /**
+ * Parameter to specify the kMeans variant.
+ */
+ public static final OptionID KMEANS_ID = new OptionID("kmeans.algorithm", "KMeans variant to run multiple times.");
+
+ /**
+ * Parameter to specify the sampling rate.
+ */
+ public static final OptionID SAMPLE_ID = new OptionID("kmeans.samplesize", "Sample set size (if > 1) or sampling rante (if < 1).");
+
+ /**
+ * Inner k-means algorithm to use.
+ */
+ protected KMeans<V, D, ?> innerkMeans;
+
+ /**
+ * Sampling rate.
+ */
+ protected double rate;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<KMeans<V, D, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
+ if (config.grab(kMeansVariantP)) {
+ ListParameterization kMeansVariantParameters = new ListParameterization();
+
+ // We will always invoke this with k as requested from outside!
+ kMeansVariantParameters.addParameter(KMeans.K_ID, 13);
+ kMeansVariantParameters.addParameter(KMeans.DISTANCE_FUNCTION_ID, SquaredEuclideanDistanceFunction.class);
+
+ ChainedParameterization combinedConfig = new ChainedParameterization(kMeansVariantParameters, config);
+ combinedConfig.errorsTo(config);
+ innerkMeans = kMeansVariantP.instantiateClass(combinedConfig);
+ }
+
+ DoubleParameter sampleP = new DoubleParameter(SAMPLE_ID);
+ if (config.grab(sampleP)) {
+ rate = sampleP.doubleValue();
+ }
+ }
+
+ @Override
+ protected SampleKMeansInitialization<V, D> makeInstance() {
+ return new SampleKMeansInitialization<>(rnd, innerkMeans, rate);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
index 2ce625b0..aa4c3e24 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2012
+Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
new file mode 100644
index 00000000..f2de7846
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java
@@ -0,0 +1,54 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+
+/**
+ * Interface for computing the quality of a K-Means clustering.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Input Object restriction type
+ * @param <D> Distance restriction type
+ */
+public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Distance<?>> {
+ /**
+ * Calculates and returns the quality measure.
+ *
+ * @param clustering Clustering to analyze
+ * @param distanceFunction Distance function to use (usually Euclidean or
+ * squared Euclidean!)
+ * @param relation Relation for accessing objects
+ * @param <V> Actual vector type (could be a subtype of O!)
+ *
+ * @return quality measure
+ */
+ <V extends O> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends D> distanceFunction, Relation<V> relation);
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
new file mode 100644
index 00000000..e0ddfff0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java
@@ -0,0 +1,89 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+
+/**
+ * Class for computing the average overall distance.
+ *
+ * The average of all average pairwise distances in a cluster.
+ *
+ * @author Stephan Baier
+ */
+public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> {
+ @Override
+ public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) {
+ @SuppressWarnings("unchecked")
+ final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters();
+
+ if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction;
+ double clusterDistanceSum = 0;
+ for (Cluster<MeanModel<V>> cluster : clusterList) {
+ DBIDs ids = cluster.getIDs();
+
+ // Compute sum of pairwise distances:
+ double clusterPairwiseDistanceSum = 0;
+ for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) {
+ V obj1 = relation.get(iter1);
+ for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ clusterPairwiseDistanceSum += df.doubleDistance(obj1, relation.get(iter2));
+ }
+ }
+ clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size());
+ }
+
+ return clusterDistanceSum / clusterList.size();
+ } else {
+ double clusterDistanceSum = 0;
+ for (Cluster<MeanModel<V>> cluster : clusterList) {
+ DBIDs ids = cluster.getIDs();
+
+ // Compute sum of pairwise distances:
+ double clusterPairwiseDistanceSum = 0;
+ for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) {
+ V obj1 = relation.get(iter1);
+ for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2)).doubleValue();
+ }
+ }
+ clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size());
+ }
+
+ return clusterDistanceSum / clusterList.size();
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
new file mode 100644
index 00000000..32ad5210
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java
@@ -0,0 +1,83 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
+
+/*
+ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+
+/**
+ * Class for computing the variance in a clustering result (sum-of-squares).
+ *
+ * @author Stephan Baier
+ */
+public class WithinClusterVarianceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> {
+ @Override
+ public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) {
+ @SuppressWarnings("unchecked")
+ final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters();
+
+ boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction);
+ if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction;
+ double variance = 0.0;
+ for (Cluster<MeanModel<V>> cluster : clusterList) {
+ DBIDs ids = cluster.getIDs();
+ V mean = cluster.getModel().getMean();
+
+ for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double dist = df.doubleDistance(relation.get(iter), mean);
+ if (squared) {
+ variance += dist;
+ } else {
+ variance += dist * dist;
+ }
+ }
+ }
+ return variance;
+ } else {
+ double variance = 0.0;
+ for (Cluster<MeanModel<V>> cluster : clusterList) {
+ DBIDs ids = cluster.getIDs();
+ V mean = cluster.getModel().getMean();
+
+ for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double dist = distanceFunction.distance(relation.get(iter), mean).doubleValue();
+ variance += dist * dist;
+ }
+ }
+ return variance;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
new file mode 100644
index 00000000..ed9a528d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Quality measures for k-Means results.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; \ No newline at end of file