summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java310
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java71
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java74
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java49
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java176
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java177
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java213
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java78
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java87
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java26
10 files changed, 1261 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
new file mode 100644
index 00000000..d3c73b53
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -0,0 +1,310 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Abstract base class for k-means implementations.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> {
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater or equal to 0, where 0 means no limit.
+ */
+ public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
+
+ /**
+ * Parameter to specify the random generator seed.
+ */
+ public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
+
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
+
+ /**
+ * Holds the value of {@link #K_ID}.
+ */
+ protected int k;
+
+ /**
+ * Holds the value of {@link #MAXITER_ID}.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMeansInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
+ * those FeatureVectors, that are nearest to the k<sup>th</sup> mean.
+ *
+ * @param relation the database to cluster
+ * @param means a list of k means
+ * @param clusters cluster assignment
+ * @return true when the object was reassigned
+ */
+ protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) {
+ boolean changed = false;
+
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
+ for(DBID id : relation.iterDBIDs()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.doubleDistance(fv, means.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ if(clusters.get(minIndex).add(id)) {
+ changed = true;
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(id)) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ else {
+ final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
+ for(DBID id : relation.iterDBIDs()) {
+ D mindist = df.getDistanceFactory().infiniteDistance();
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ D dist = df.distance(fv, means.get(i));
+ if(dist.compareTo(mindist) < 0) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ if(clusters.get(minIndex).add(id)) {
+ changed = true;
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(id)) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return changed;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ /**
+ * Returns the mean vectors of the given clusters in the given database.
+ *
+ * @param clusters the clusters to compute the means
+ * @param means the recent means
+ * @param database the database containing the vectors
+ * @return the mean vectors of the given clusters in the given database
+ */
+ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) {
+ List<Vector> newMeans = new ArrayList<Vector>(k);
+ for(int i = 0; i < k; i++) {
+ ModifiableDBIDs list = clusters.get(i);
+ Vector mean = null;
+ for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) {
+ if(mean == null) {
+ mean = database.get(clusterIter.next()).getColumnVector();
+ }
+ else {
+ mean.plusEquals(database.get(clusterIter.next()).getColumnVector());
+ }
+ }
+ if(list.size() > 0) {
+ assert mean != null;
+ mean.timesEquals(1.0 / list.size());
+ }
+ else {
+ mean = means.get(i);
+ }
+ newMeans.add(mean);
+ }
+ return newMeans;
+ }
+
+ /**
+ * Compute an incremental update for the mean
+ *
+ * @param mean Mean to update
+ * @param vec Object vector
+ * @param newsize (New) size of cluster
+ * @param op Cluster size change / Weight change
+ */
+ protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) {
+ if(newsize == 0) {
+ return; // Keep old mean
+ }
+ Vector delta = vec.getColumnVector();
+ // Compute difference from mean
+ delta.minusEquals(mean);
+ delta.timesEquals(op / newsize);
+ mean.plusEquals(delta);
+ }
+
+ /**
+ * Perform a MacQueen style iteration.
+ *
+ * @param relation Relation
+ * @param means Means
+ * @param clusters Clusters
+ * @return true when the means have changed
+ */
+ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
+ boolean changed = false;
+
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ // Raw distance function
+ @SuppressWarnings("unchecked")
+ final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
+
+ // Incremental update
+ for(DBID id : relation.iterDBIDs()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.doubleDistance(fv, means.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ // Update the cluster mean incrementally:
+ for(int i = 0; i < k; i++) {
+ ModifiableDBIDs ci = clusters.get(i);
+ if(i == minIndex) {
+ if(ci.add(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ changed = true;
+ }
+ }
+ else if(ci.remove(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ changed = true;
+ }
+ }
+ }
+ }
+ else {
+ // Raw distance function
+ final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
+
+ // Incremental update
+ for(DBID id : relation.iterDBIDs()) {
+ D mindist = df.getDistanceFactory().infiniteDistance();
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ D dist = df.distance(fv, means.get(i));
+ if(dist.compareTo(mindist) < 0) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ // Update the cluster mean incrementally:
+ for(int i = 0; i < k; i++) {
+ ModifiableDBIDs ci = clusters.get(i);
+ if(i == minIndex) {
+ if(ci.add(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ changed = true;
+ }
+ }
+ else if(ci.remove(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ changed = true;
+ }
+ }
+ }
+ }
+ return changed;
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
new file mode 100644
index 00000000..b5f088fb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
@@ -0,0 +1,71 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
+
+/**
+ * Abstract base class for common k-means initializations.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> {
+ /**
+ * Holds the value of {@link KMeansLloyd#SEED_ID}.
+ */
+ protected Long seed;
+
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public AbstractKMeansInitialization(Long seed) {
+ this.seed = seed;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ protected Long seed;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true);
+ if(config.grab(seedP)) {
+ seed = seedP.getValue();
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
new file mode 100644
index 00000000..78ccd426
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
@@ -0,0 +1,74 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Initialize K-means by using the first k objects as initial means.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ */
+ public FirstKInitialMeans() {
+ super(null);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ Iterator<DBID> iter = relation.iterDBIDs();
+ List<Vector> means = new ArrayList<Vector>(k);
+ for(int i = 0; i < k && iter.hasNext(); i++) {
+ means.add(relation.get(iter.next()).getColumnVector());
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ @Override
+ protected FirstKInitialMeans<V> makeInstance() {
+ return new FirstKInitialMeans<V>();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
new file mode 100644
index 00000000..f4c0d9c7
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
@@ -0,0 +1,49 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Interface for initializing K-Means
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public interface KMeansInitialization<V extends NumberVector<V, ?>> {
+ /**
+ * Choose initial means
+ *
+ * @param relation Relation
+ * @param k Parameter k
+ * @param distanceFunction Distance function
+ * @return List of chosen means for k-means
+ */
+ public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
new file mode 100644
index 00000000..fda1d6c0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -0,0 +1,176 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-means algorithm, using Lloyd-style bulk iterations.
+ *
+ * <p>
+ * Reference:<br />
+ * S. Lloyd<br/>
+ * Least squares quantization in PCM<br/>
+ * IEEE Transactions on Information Theory 28 (2)<br/>
+ * previously published as Bell Telephone Laboratories Paper
+ * </p>
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+@Title("K-Means")
+@Description("Finds a partitioning into k clusters.")
+@Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
+public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMeansLloyd.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Run k-means
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ * @throws IllegalStateException
+ */
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ if(relation.size() <= 0) {
+ return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if(logger.isVerbose()) {
+ logger.verbose("K-Means iteration " + (iteration + 1));
+ }
+ boolean changed = assignToNearestCluster(relation, means, clusters);
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute means.
+ means = means(clusters, means, relation);
+ }
+ // Wrap result
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
+ result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected AbstractKMeans<V, D> makeInstance() {
+ return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
new file mode 100644
index 00000000..56492dd0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -0,0 +1,177 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-means algorithm, using MacQueen style incremental updates.
+ *
+ * <p>
+ * Reference:<br />
+ * J. MacQueen: Some Methods for Classification and Analysis of Multivariate
+ * Observations. <br />
+ * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297.
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector type to use
+ * @param <D> distance function value type
+ */
+@Title("K-Means")
+@Description("Finds a partitioning into k clusters.")
+@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992")
+public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMeansMacQueen.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Run k-means
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ * @throws IllegalStateException
+ */
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ if(relation.size() <= 0) {
+ return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ // Initialize cluster and assign objects
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+ assignToNearestCluster(relation, means, clusters);
+ // Initial recomputation of the means.
+ means = means(clusters, means, relation);
+
+ // Refine result
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if(logger.isVerbose()) {
+ logger.verbose("K-Means iteration " + (iteration + 1));
+ }
+ boolean changed = macQueenIterate(relation, means, clusters);
+ if(!changed) {
+ break;
+ }
+ }
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
+ result.addCluster(new Cluster<MeanModel<V>>(ids, model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected AbstractKMeans<V, D> makeInstance() {
+ return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
new file mode 100644
index 00000000..c7a2fa1d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -0,0 +1,213 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+
+/**
+ * K-Means++ initialization for k-means.
+ *
+ * Reference:
+ * <p>
+ * D. Arthur, S. Vassilvitskii<br />
+ * k-means++: the advantages of careful seeding<br />
+ * In: Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms,
+ * SODA 2007
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494")
+public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public KMeansPlusPlusInitialMeans(Long seed) {
+ super(seed);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ // Get a distance query
+ if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("K-Means++ initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
+ DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
+
+ // Chose first mean
+ List<Vector> means = new ArrayList<Vector>(k);
+
+ Random random = (seed != null) ? new Random(seed) : new Random();
+ DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next();
+ means.add(relation.get(first).getColumnVector());
+
+ ModifiableDBIDs chosen = DBIDUtil.newHashSet(k);
+ chosen.add(first);
+ ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
+ // Initialize weights
+ double[] weights = new double[ids.size()];
+ double weightsum = initialWeights(weights, ids, first, distQ);
+ while(means.size() < k) {
+ if(weightsum > Double.MAX_VALUE) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?");
+ }
+ if(weightsum < Double.MIN_NORMAL) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?");
+ }
+ double r = random.nextDouble() * weightsum;
+ int pos = 0;
+ while(r > 0 && pos < weights.length) {
+ r -= weights[pos];
+ pos++;
+ }
+ // Add new mean:
+ DBID newmean = ids.get(pos);
+ means.add(relation.get(newmean).getColumnVector());
+ chosen.add(newmean);
+ // Update weights:
+ weights[pos] = 0.0;
+ // Choose optimized version for double distances, if applicable.
+ if (distF instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF;
+ weightsum = updateWeights(weights, ids, newmean, ddist, relation);
+ } else {
+ weightsum = updateWeights(weights, ids, newmean, distQ);
+ }
+ }
+
+ return means;
+ }
+
+ /**
+ * Initialize the weight list.
+ *
+ * @param weights Weight list
+ * @param ids IDs
+ * @param latest Added ID
+ * @param distQ Distance query
+ * @return Weight sum
+ */
+ protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ double weightsum = 0.0;
+ DBIDIter it = ids.iter();
+ for(int i = 0; i < weights.length; i++, it.advance()) {
+ DBID id = it.getDBID();
+ if(latest.equals(id)) {
+ weights[i] = 0.0;
+ }
+ else {
+ double d = distQ.distance(latest, id).doubleValue();
+ weights[i] = d * d;
+ }
+ weightsum += weights[i];
+ }
+ return weightsum;
+ }
+
+ /**
+ * Update the weight list.
+ *
+ * @param weights Weight list
+ * @param ids IDs
+ * @param latest Added ID
+ * @param distQ Distance query
+ * @return Weight sum
+ */
+ protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ double weightsum = 0.0;
+ DBIDIter it = ids.iter();
+ for(int i = 0; i < weights.length; i++, it.advance()) {
+ DBID id = it.getDBID();
+ if(weights[i] > 0.0) {
+ double d = distQ.distance(latest, id).doubleValue();
+ weights[i] = Math.min(weights[i], d * d);
+ weightsum += weights[i];
+ }
+ }
+ return weightsum;
+ }
+
+ /**
+ * Update the weight list.
+ *
+ * @param weights Weight list
+ * @param ids IDs
+ * @param latest Added ID
+ * @param distF Distance function
+ * @return Weight sum
+ */
+ protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, PrimitiveDoubleDistanceFunction<V> distF, Relation<V> rel) {
+ final V lv = rel.get(latest);
+ double weightsum = 0.0;
+ DBIDIter it = ids.iter();
+ for(int i = 0; i < weights.length; i++, it.advance()) {
+ DBID id = it.getDBID();
+ if(weights[i] > 0.0) {
+ double d = distF.doubleDistance(lv, rel.get(id));
+ weights[i] = Math.min(weights[i], d * d);
+ weightsum += weights[i];
+ }
+ }
+ return weightsum;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ @Override
+ protected KMeansPlusPlusInitialMeans<V, D> makeInstance() {
+ return new KMeansPlusPlusInitialMeans<V, D>(seed);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
new file mode 100644
index 00000000..30e59453
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
@@ -0,0 +1,78 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Initialize K-means by randomly choosing k exsiting elements as cluster
+ * centers.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public RandomlyChosenInitialMeans(Long seed) {
+ super(seed);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed);
+ List<Vector> means = new ArrayList<Vector>(k);
+ for(DBID id : ids) {
+ means.add(relation.get(id).getColumnVector());
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+
+ @Override
+ protected RandomlyChosenInitialMeans<V> makeInstance() {
+ return new RandomlyChosenInitialMeans<V>(seed);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
new file mode 100644
index 00000000..e8a466dd
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -0,0 +1,87 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * Initialize k-means by generating random vectors (within the data sets value
+ * range).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public RandomlyGeneratedInitialMeans(Long seed) {
+ super(seed);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ final int dim = DatabaseUtil.dimensionality(relation);
+ Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
+ List<Vector> means = new ArrayList<Vector>(k);
+ final Random random = (this.seed != null) ? new Random(this.seed) : new Random();
+ for(int i = 0; i < k; i++) {
+ double[] r = MathUtil.randomDoubleArray(dim, random);
+ // Rescale
+ for(int d = 0; d < dim; d++) {
+ r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
+ }
+ means.add(new Vector(r));
+ }
+ return means;
+ }
+
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+
+ @Override
+ protected RandomlyGeneratedInitialMeans<V> makeInstance() {
+ return new RandomlyGeneratedInitialMeans<V>(seed);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
new file mode 100644
index 00000000..2ce625b0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>K-means clustering and variations.</p>
+ */
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2012
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; \ No newline at end of file