path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java  168
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java  9
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java  32
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java  55
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java  8
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java  8
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java  10
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java  84
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java  172
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java  271
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java  45
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java  310
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java  187
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java  22
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java  9
15 files changed, 1244 insertions, 146 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index d3c73b53..92862909 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -1,24 +1,5 @@
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
-import de.lmu.ifi.dbs.elki.data.Clustering;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
-import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
@@ -42,37 +23,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+
/**
* Abstract base class for k-means implementations.
*
* @author Erich Schubert
*
+ * @apiviz.composedOf KMeansInitialization
+ *
* @param <V> Vector type
* @param <D> Distance type
*/
-public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> {
- /**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater than 0.
- */
- public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
-
- /**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater or equal to 0, where 0 means no limit.
- */
- public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
-
- /**
- * Parameter to specify the random generator seed.
- */
- public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
-
- /**
- * Parameter to specify the initialization method
- */
- public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
-
+public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> implements KMeans {
/**
* Holds the value of {@link #K_ID}.
*/
@@ -94,6 +77,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
* @param distanceFunction distance function
* @param k k parameter
* @param maxiter Maxiter parameter
+ * @param initializer Function to generate the initial means
*/
public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
super(distanceFunction);
@@ -111,15 +95,15 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
* @param clusters cluster assignment
* @return true when the object was reassigned
*/
- protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) {
+ protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?, ?>> means, List<? extends ModifiableDBIDs> clusters) {
boolean changed = false;
if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
@@ -128,13 +112,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
mindist = dist;
}
}
- if(clusters.get(minIndex).add(id)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
for(int i = 0; i < k; i++) {
if(i != minIndex) {
- if(clusters.get(i).remove(id)) {
+ if(clusters.get(i).remove(iditer)) {
break;
}
}
@@ -144,9 +128,9 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
}
else {
final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
@@ -155,13 +139,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
mindist = dist;
}
}
- if(clusters.get(minIndex).add(id)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
for(int i = 0; i < k; i++) {
if(i != minIndex) {
- if(clusters.get(i).remove(id)) {
+ if(clusters.get(i).remove(iditer)) {
break;
}
}
@@ -185,25 +169,23 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
* @param database the database containing the vectors
* @return the mean vectors of the given clusters in the given database
*/
- protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) {
+ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> means, Relation<V> database) {
List<Vector> newMeans = new ArrayList<Vector>(k);
for(int i = 0; i < k; i++) {
ModifiableDBIDs list = clusters.get(i);
Vector mean = null;
- for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) {
- if(mean == null) {
- mean = database.get(clusterIter.next()).getColumnVector();
- }
- else {
- mean.plusEquals(database.get(clusterIter.next()).getColumnVector());
- }
- }
if(list.size() > 0) {
- assert mean != null;
- mean.timesEquals(1.0 / list.size());
+ double s = 1.0 / list.size();
+ DBIDIter iter = list.iter();
+ assert (iter.valid());
+ mean = database.get(iter).getColumnVector().timesEquals(s);
+ iter.advance();
+ for(; iter.valid(); iter.advance()) {
+ mean.plusTimesEquals(database.get(iter).getColumnVector(), s);
+ }
}
else {
- mean = means.get(i);
+ mean = means.get(i).getColumnVector();
}
newMeans.add(mean);
}
@@ -211,6 +193,36 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
}
/**
+ * Returns the median vectors of the given clusters in the given database.
+ *
+ * @param clusters the clusters to compute the medians for
+ * @param medians the recent medians
+ * @param database the database containing the vectors
+ * @return the median vectors of the given clusters in the given database
+ */
+ protected List<NumberVector<?, ?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> medians, Relation<V> database) {
+ final int dim = medians.get(0).getDimensionality();
+ final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database);
+ List<NumberVector<?, ?>> newMedians = new ArrayList<NumberVector<?, ?>>(k);
+ for(int i = 0; i < k; i++) {
+ ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i));
+ if(list.size() > 0) {
+ Vector mean = new Vector(dim);
+ for(int d = 0; d < dim; d++) {
+ sorter.setDimension(d + 1);
+ DBID id = QuickSelect.median(list, sorter);
+ mean.set(d, database.get(id).doubleValue(d + 1));
+ }
+ newMedians.add(mean);
+ }
+ else {
+ newMedians.add((NumberVector<?, ?>) medians.get(i));
+ }
+ }
+ return newMedians;
+ }
+
+ /**
* Compute an incremental update for the mean
*
* @param mean Mean to update
@@ -239,16 +251,16 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
*/
protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
boolean changed = false;
-
+
if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
// Raw distance function
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
-
+
// Incremental update
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
@@ -261,13 +273,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
for(int i = 0; i < k; i++) {
ModifiableDBIDs ci = clusters.get(i);
if(i == minIndex) {
- if(ci.add(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ if(ci.add(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
changed = true;
}
}
- else if(ci.remove(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ else if(ci.remove(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
changed = true;
}
}
@@ -276,11 +288,11 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
else {
// Raw distance function
final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
-
+
// Incremental update
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
@@ -293,13 +305,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
for(int i = 0; i < k; i++) {
ModifiableDBIDs ci = clusters.get(i);
if(i == minIndex) {
- if(ci.add(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ if(ci.add(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
changed = true;
}
}
- else if(ci.remove(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ else if(ci.remove(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
changed = true;
}
}
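The new medians() helper computes a coordinate-wise median: for each dimension it runs QuickSelect on the cluster's DBID array (ordered by that single dimension) and reads off the middle element, falling back to the previous median when a cluster is empty. Below is a minimal, self-contained Java sketch of the same idea on plain double[] points; the class and method names are hypothetical and a full sort stands in for QuickSelect.

import java.util.Arrays;

// Hypothetical illustration of the coordinate-wise median used by AbstractKMeans.medians():
// for every dimension d, take the median of the d-th coordinates of the cluster members.
public class CoordinatewiseMedian {
  static double[] median(double[][] cluster) {
    final int dim = cluster[0].length;
    double[] med = new double[dim];
    double[] buf = new double[cluster.length];
    for(int d = 0; d < dim; d++) {
      for(int i = 0; i < cluster.length; i++) {
        buf[i] = cluster[i][d];
      }
      Arrays.sort(buf); // the ELKI code uses QuickSelect, which avoids the full sort
      med[d] = buf[buf.length >> 1];
    }
    return med;
  }

  public static void main(String[] args) {
    double[][] cluster = { { 1, 10 }, { 2, 20 }, { 100, 30 } };
    System.out.println(Arrays.toString(median(cluster))); // prints [2.0, 20.0]
  }
}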
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
index b5f088fb..a8effecd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
@@ -22,7 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
@@ -34,9 +33,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
*
* @param <V> Vector type
*/
-public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> {
+public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> {
/**
- * Holds the value of {@link KMeansLloyd#SEED_ID}.
+ * Holds the value of {@link KMeans#SEED_ID}.
*/
protected Long seed;
@@ -56,13 +55,13 @@ public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>>
*
* @apiviz.exclude
*/
- public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ public abstract static class Parameterizer<V> extends AbstractParameterizer {
protected Long seed;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true);
+ LongParameter seedP = new LongParameter(KMeans.SEED_ID, true);
if(config.grab(seedP)) {
seed = seedP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
index 78ccd426..7a7f2867 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
@@ -23,14 +23,16 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
@@ -40,20 +42,30 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @param <V> Vector type
*/
-public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsInitialization<V> {
/**
* Constructor.
*/
public FirstKInitialMeans() {
- super(null);
+ super();
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
- Iterator<DBID> iter = relation.iterDBIDs();
- List<Vector> means = new ArrayList<Vector>(k);
- for(int i = 0; i < k && iter.hasNext(); i++) {
- means.add(relation.get(iter.next()).getColumnVector());
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ DBIDIter iter = relation.iterDBIDs();
+ List<V> means = new ArrayList<V>(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(relation.get(iter));
+ }
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) {
+ DBIDIter iter = distanceFunction.getRelation().iterDBIDs();
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(iter);
}
return means;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
new file mode 100644
index 00000000..37171d4a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
@@ -0,0 +1,55 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Some constants and options shared among the k-means family of algorithms.
+ *
+ * @author Erich Schubert
+ */
+public interface KMeans {
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
+
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater or equal to 0, where 0 means no limit.
+ */
+ public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
+
+ /**
+ * Parameter to specify the random generator seed.
+ */
+ public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
+}
\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
index f4c0d9c7..9e5d69f0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
@@ -24,19 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
*/
import java.util.List;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
/**
* Interface for initializing K-Means
*
* @author Erich Schubert
*
- * @param <V> Vector type
+ * @param <V> Object type
*/
-public interface KMeansInitialization<V extends NumberVector<V, ?>> {
+public interface KMeansInitialization<V> {
/**
* Choose initial means
*
@@ -45,5 +43,5 @@ public interface KMeansInitialization<V extends NumberVector<V, ?>> {
* @param distanceFunction Distance function
* @return List of chosen means for k-means
*/
- public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
+ public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
}
\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index fda1d6c0..b1b40632 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -39,7 +39,6 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
@@ -94,14 +93,13 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex
* @param database Database
* @param relation relation to use
* @return result
- * @throws IllegalStateException
*/
- public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
if(relation.size() <= 0) {
return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ List<? extends NumberVector<?, ?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
for(int i = 0; i < k; i++) {
@@ -124,7 +122,7 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex
final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
for(int i = 0; i < clusters.size(); i++) {
- MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
}
return result;
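KMeansLloyd follows the classic Lloyd scheme (the iteration loop itself is unchanged by this diff and not shown here): assign every object to its nearest mean, recompute the means from the new assignment, and stop as soon as no assignment changed or the maxiter limit is reached. A rough, self-contained illustration of that control flow on plain arrays, with hypothetical names and squared Euclidean distance in place of the configurable ELKI distance function:

import java.util.Arrays;

// Minimal Lloyd-style k-means sketch; hypothetical illustration only.
public class LloydSketch {
  static int[] cluster(double[][] data, double[][] means, int maxiter) {
    int[] assign = new int[data.length];
    Arrays.fill(assign, -1);
    for(int iter = 0; maxiter <= 0 || iter < maxiter; iter++) {
      boolean changed = false;
      // Assignment step: nearest mean by squared Euclidean distance.
      for(int i = 0; i < data.length; i++) {
        int best = 0;
        double bestd = Double.POSITIVE_INFINITY;
        for(int j = 0; j < means.length; j++) {
          double d = 0;
          for(int l = 0; l < data[i].length; l++) {
            double diff = data[i][l] - means[j][l];
            d += diff * diff;
          }
          if(d < bestd) { bestd = d; best = j; }
        }
        if(assign[i] != best) { assign[i] = best; changed = true; }
      }
      if(!changed) { break; } // same stopping rule as the ELKI loop
      // Update step: recompute each mean from its members.
      double[][] sums = new double[means.length][data[0].length];
      int[] counts = new int[means.length];
      for(int i = 0; i < data.length; i++) {
        counts[assign[i]]++;
        for(int l = 0; l < data[i].length; l++) {
          sums[assign[i]][l] += data[i][l];
        }
      }
      for(int j = 0; j < means.length; j++) {
        if(counts[j] > 0) { // empty clusters keep their previous mean, as in means()
          for(int l = 0; l < sums[j].length; l++) {
            means[j][l] = sums[j][l] / counts[j];
          }
        }
      }
    }
    return assign;
  }
}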
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index 56492dd0..c729eb10 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -93,15 +93,17 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>>
*
* @param database Database
* @param relation relation to use
- * @return result
- * @throws IllegalStateException
+ * @return Clustering result
*/
- public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
if(relation.size() <= 0) {
return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ List<Vector> means = new ArrayList<Vector>(k);
+ for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) {
+ means.add(nv.getColumnVector());
+ }
// Initialize cluster and assign objects
List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
for(int i = 0; i < k; i++) {
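KMeansMacQueen differs from the Lloyd variant in that it updates the two affected means immediately whenever an object switches clusters, via incrementalUpdateMean(mean, vector, size, +1/-1); the helper itself is not part of this diff. The sketch below shows the standard running-mean update that this call pattern suggests, under the assumption that the size argument is the cluster size after the change; the names are hypothetical and this is not the ELKI implementation.

// Running-mean update: add (op = +1) or remove (op = -1) one point.
public class IncrementalMean {
  static void update(double[] mean, double[] vec, int newSize, int op) {
    if(newSize == 0) {
      return; // cluster became empty; keep the old mean
    }
    double f = (double) op / newSize;
    for(int d = 0; d < mean.length; d++) {
      mean[d] += f * (vec[d] - mean[d]);
    }
  }

  public static void main(String[] args) {
    double[] mean = { 1.0 };                   // cluster currently {1}
    update(mean, new double[] { 3.0 }, 2, +1); // add 3 -> mean of {1, 3}
    System.out.println(mean[0]);               // prints 2.0
  }
}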
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
index c7a2fa1d..9afeff6c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -26,19 +26,18 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Random;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -59,7 +58,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
* @param <D> Distance type
*/
@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494")
-public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> {
+public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
/**
* Constructor.
*
@@ -70,7 +69,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
// Get a distance query
if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("K-Means++ initialization can only be used with numerical distances.");
@@ -80,14 +79,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
// Chose first mean
- List<Vector> means = new ArrayList<Vector>(k);
+ List<V> means = new ArrayList<V>(k);
Random random = (seed != null) ? new Random(seed) : new Random();
- DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next();
- means.add(relation.get(first).getColumnVector());
+ DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iter().getDBID();
+ means.add(relation.get(first));
- ModifiableDBIDs chosen = DBIDUtil.newHashSet(k);
- chosen.add(first);
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
// Initialize weights
double[] weights = new double[ids.size()];
@@ -107,16 +104,16 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
}
// Add new mean:
DBID newmean = ids.get(pos);
- means.add(relation.get(newmean).getColumnVector());
- chosen.add(newmean);
+ means.add(relation.get(newmean));
// Update weights:
weights[pos] = 0.0;
// Choose optimized version for double distances, if applicable.
- if (distF instanceof PrimitiveDoubleDistanceFunction) {
+ if(distF instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF;
weightsum = updateWeights(weights, ids, newmean, ddist, relation);
- } else {
+ }
+ else {
weightsum = updateWeights(weights, ids, newmean, distQ);
}
}
@@ -124,6 +121,48 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
return means;
}
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
+ if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("PAM initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
+ // Chose first mean
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+ Random random = (seed != null) ? new Random(seed) : new Random();
+ DBID first = DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random.nextLong()).iter().getDBID();
+ means.add(first);
+
+ ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs());
+ // Initialize weights
+ double[] weights = new double[ids.size()];
+ double weightsum = initialWeights(weights, ids, first, distQ);
+ while(means.size() < k) {
+ if(weightsum > Double.MAX_VALUE) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?");
+ }
+ if(weightsum < Double.MIN_NORMAL) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too few data points?");
+ }
+ double r = random.nextDouble() * weightsum;
+ int pos = 0;
+ while(r > 0 && pos < weights.length) {
+ r -= weights[pos];
+ pos++;
+ }
+ // Add new mean:
+ DBID newmean = ids.get(pos);
+ means.add(newmean);
+ // Update weights:
+ weights[pos] = 0.0;
+ weightsum = updateWeights(weights, ids, newmean, distQ);
+ }
+
+ return means;
+ }
+
/**
* Initialize the weight list.
*
@@ -133,16 +172,15 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
* @param distQ Distance query
* @return Weight sum
*/
- protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) {
double weightsum = 0.0;
DBIDIter it = ids.iter();
for(int i = 0; i < weights.length; i++, it.advance()) {
- DBID id = it.getDBID();
- if(latest.equals(id)) {
+ if(latest.sameDBID(it)) {
weights[i] = 0.0;
}
else {
- double d = distQ.distance(latest, id).doubleValue();
+ double d = distQ.distance(latest, it).doubleValue();
weights[i] = d * d;
}
weightsum += weights[i];
@@ -159,13 +197,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
* @param distQ Distance query
* @return Weight sum
*/
- protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) {
double weightsum = 0.0;
DBIDIter it = ids.iter();
for(int i = 0; i < weights.length; i++, it.advance()) {
- DBID id = it.getDBID();
if(weights[i] > 0.0) {
- double d = distQ.distance(latest, id).doubleValue();
+ double d = distQ.distance(latest, it).doubleValue();
weights[i] = Math.min(weights[i], d * d);
weightsum += weights[i];
}
@@ -187,9 +224,8 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
double weightsum = 0.0;
DBIDIter it = ids.iter();
for(int i = 0; i < weights.length; i++, it.advance()) {
- DBID id = it.getDBID();
if(weights[i] > 0.0) {
- double d = distF.doubleDistance(lv, rel.get(id));
+ double d = distF.doubleDistance(lv, rel.get(it));
weights[i] = Math.min(weights[i], d * d);
weightsum += weights[i];
}
@@ -204,7 +240,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
@Override
protected KMeansPlusPlusInitialMeans<V, D> makeInstance() {
return new KMeansPlusPlusInitialMeans<V, D>(seed);
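Both chooseInitialMeans() and the new chooseInitialMedoids() above implement the same k-means++ seeding loop: every point keeps a weight equal to its squared distance to the closest center picked so far, the next center is drawn with probability proportional to that weight (the walk over r), and the weights are then shrunk with min(old, d²) while chosen points get weight zero. Here is a compact, self-contained sketch of this D² sampling on plain double[] data; the class name is hypothetical and plain Euclidean distance replaces the configurable ELKI distance query.

import java.util.Random;

// Hypothetical sketch of k-means++ D^2 seeding, mirroring the weight bookkeeping of
// KMeansPlusPlusInitialMeans (initialWeights / updateWeights) on plain arrays.
public class KMeansPlusPlusSketch {
  static double sqDist(double[] a, double[] b) {
    double s = 0;
    for(int i = 0; i < a.length; i++) {
      double d = a[i] - b[i];
      s += d * d;
    }
    return s;
  }

  static double[][] seed(double[][] data, int k, Random random) {
    double[][] centers = new double[k][];
    centers[0] = data[random.nextInt(data.length)];
    // Weight of each point: squared distance to its nearest chosen center.
    double[] weights = new double[data.length];
    double weightsum = 0;
    for(int i = 0; i < data.length; i++) {
      weights[i] = sqDist(data[i], centers[0]);
      weightsum += weights[i];
    }
    for(int c = 1; c < k; c++) {
      // Draw an index with probability proportional to its weight.
      double r = random.nextDouble() * weightsum;
      int pos = 0;
      while(pos < weights.length - 1 && r >= weights[pos]) {
        r -= weights[pos];
        pos++;
      }
      centers[c] = data[pos];
      weights[pos] = 0.0; // never pick the same point twice
      // Shrink weights to the nearest chosen center, as in updateWeights().
      weightsum = 0;
      for(int i = 0; i < data.length; i++) {
        weights[i] = Math.min(weights[i], sqDist(data[i], centers[c]));
        weightsum += weights[i];
      }
    }
    return centers;
  }
}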
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
new file mode 100644
index 00000000..8c284981
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -0,0 +1,172 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-medians clustering algorithm, using Lloyd-style bulk
+ * iterations.
+ *
+ * Reference:
+ * <p>
+ * Clustering via Concave Minimization<br />
+ * P. S. Bradley, O. L. Mangasarian, W. N. Street<br />
+ * in: Advances in neural information processing systems
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+@Title("K-Medians")
+@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url="http://nips.djvuzone.org/djvu/nips09/0368.djvu")
+public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMediansLloyd.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Run k-medians
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering");
+ }
+ // Choose initial medians
+ List<? extends NumberVector<?, ?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if(logger.isVerbose()) {
+ logger.verbose("K-Medians iteration " + (iteration + 1));
+ }
+ boolean changed = assignToNearestCluster(relation, medians, clusters);
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute medians.
+ medians = medians(clusters, medians, relation);
+ }
+ // Wrap result
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef()));
+ result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected AbstractKMeans<V, D> makeInstance() {
+ return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
\ No newline at end of file
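The only difference between KMediansLloyd and KMeansLloyd is the update step: medians() replaces means(). This matches the cost being optimized: per cluster, the coordinate-wise median minimizes the sum of absolute (L1) deviations, whereas the mean minimizes squared Euclidean error. A tiny numeric check follows; it is hypothetical illustration code, not part of ELKI.

// Verify that the 1-d median beats the mean on the L1 (absolute-deviation) cost.
public class MedianCostCheck {
  static double l1Cost(double[] xs, double c) {
    double s = 0;
    for(double x : xs) {
      s += Math.abs(x - c);
    }
    return s;
  }

  public static void main(String[] args) {
    double[] xs = { 1, 2, 100 };
    System.out.println(l1Cost(xs, 2.0));                 // median: 99.0
    System.out.println(l1Cost(xs, (1 + 2 + 100) / 3.0)); // mean: ~131.33
  }
}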
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
new file mode 100644
index 00000000..a5c3d675
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -0,0 +1,271 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-medoids clustering algorithm, using a "bulk" variation of the
+ * "Partitioning Around Medoids" approach.
+ *
+ * In contrast to PAM, which in each iteration swaps one medoid with one
+ * (arbitrary) non-medoid, this implementation follows the EM pattern: in the
+ * expectation step, the best medoid from the cluster members is chosen; in the
+ * maximization step, the objects are reassigned to their nearest medoid.
+ *
+ * We do not have a reference for this algorithm. It borrows ideas from EM and
+ * PAM. If needed, you are welcome to cite it using the latest ELKI publication
+ * (this variation is likely not worth publishing on its own).
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MedoidModel
+ * @apiviz.composedOf KMedoidsInitialization
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMedoidsEM.class);
+
+ /**
+ * Holds the value of {@link AbstractKMeans#K_ID}.
+ */
+ protected int k;
+
+ /**
+ * Holds the value of {@link AbstractKMeans#MAXITER_ID}.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMedoidsInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Function to generate the initial means
+ */
+ public KMedoidsEM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Run k-medoids
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ }
+ DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ // Choose initial medoids
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+ Mean[] mdists = Mean.newArray(k);
+
+ // Initial assignment to nearest medoids
+ // TODO: reuse this information, from the build phase, when possible?
+ assignToNearestCluster(medoids, mdists, clusters, distQ);
+
+ // Swap phase
+ boolean changed = true;
+ while(changed) {
+ changed = false;
+ // Try to swap the medoid with a better cluster member:
+ for(int i = 0; i < k; i++) {
+ DBID med = medoids.get(i);
+ DBID best = null;
+ Mean bestm = mdists[i];
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(med.sameDBID(iter)) {
+ continue;
+ }
+ Mean mdist = new Mean();
+ for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
+ mdist.put(distQ.distance(iter, iter2).doubleValue());
+ }
+ if(mdist.getMean() < bestm.getMean()) {
+ best = iter.getDBID();
+ bestm = mdist;
+ }
+ }
+ if(best != null && !med.sameDBID(best)) {
+ changed = true;
+ medoids.set(i, best);
+ mdists[i] = bestm;
+ }
+ }
+ // Reassign
+ if(changed) {
+ assignToNearestCluster(medoids, mdists, clusters, distQ);
+ }
+ }
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MedoidModel model = new MedoidModel(medoids.get(i));
+ result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
+ * those FeatureVectors that are nearest to the k<sup>th</sup> mean.
+ *
+ * @param means a list of k means
+ * @param mdist Mean distances
+ * @param clusters cluster assignment
+ * @param distQ distance query
+ * @return true when the object was reassigned
+ */
+ protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ boolean changed = false;
+
+ double[] dists = new double[k];
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ int minIndex = 0;
+ double mindist = Double.POSITIVE_INFINITY;
+ for(int i = 0; i < k; i++) {
+ dists[i] = distQ.distance(iditer, means.get(i)).doubleValue();
+ if(dists[i] < mindist) {
+ minIndex = i;
+ mindist = dists[i];
+ }
+ }
+ if(clusters.get(minIndex).add(iditer)) {
+ changed = true;
+ mdist[minIndex].put(mindist);
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
+ mdist[minIndex].put(dists[i], -1);
+ break;
+ }
+ }
+ }
+ }
+ }
+ return changed;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMedoidsInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected KMedoidsEM<V, D> makeInstance() {
+ return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
\ No newline at end of file
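KMedoidsEM's swap phase replaces each cluster's medoid by the member with the smallest mean distance to the other members (tracked with the Mean helper), and only when some medoid actually changed are the objects reassigned to their nearest medoid; the loop repeats until nothing moves. Below is a self-contained sketch of just the per-cluster medoid update over a precomputed distance matrix, with hypothetical names rather than the ELKI API.

// Hypothetical per-cluster medoid update: pick the member with the smallest
// mean distance to the other members. "dist" is a full, symmetric distance matrix.
public class MedoidUpdateSketch {
  static int bestMedoid(double[][] dist, int[] members) {
    int best = members[0];
    double bestMean = Double.POSITIVE_INFINITY;
    for(int cand : members) {
      double sum = 0;
      for(int other : members) {
        sum += dist[cand][other]; // distance to itself is 0, so including it is harmless
      }
      double mean = sum / members.length;
      if(mean < bestMean) {
        bestMean = mean;
        best = cand;
      }
    }
    return best;
  }
}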
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
new file mode 100644
index 00000000..269e7e9e
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
@@ -0,0 +1,45 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+
+/**
+ * Interface for initializing K-Medoids. In contrast to k-means initializers,
+ * this initialization will only return members of the original data set.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Object type
+ */
+public interface KMedoidsInitialization<V> {
+ /**
+ * Choose initial medoids
+ *
+ * @param k Parameter k
+ * @param distanceFunction Distance function
+ * @return List of chosen medoids for k-medoids
+ */
+ public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction);
+}
\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
new file mode 100644
index 00000000..30c80084
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -0,0 +1,310 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-medoids clustering algorithm, using the
+ * "Partitioning Around Medoids" approach.
+ *
+ * Reference:
+ * <p>
+ * Clustering by means of Medoids<br />
+ * Kaufman, L. and Rousseeuw, P.J.<br />
+ * in: Statistical Data Analysis Based on the L_1-Norm and Related Methods
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MedoidModel
+ * @apiviz.composedOf KMedoidsInitialization
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+@Title("Partitioning Around Medoids")
+@Reference(title = "Clustering by means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1-Norm and Related Methods")
+public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMedoidsPAM.class);
+
+ /**
+ * Holds the value of {@link AbstractKMeans#K_ID}.
+   * Holds the value of {@link KMeans#K_ID}.
+ protected int k;
+
+ /**
+ * Holds the value of {@link AbstractKMeans#MAXITER_ID}.
+   * Holds the value of {@link KMeans#MAXITER_ID}.
+ protected int maxiter;
+
+ /**
+   * Method to choose initial medoids.
+ */
+ protected KMedoidsInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+   * @param initializer Function to generate the initial medoids
+ */
+ public KMedoidsPAM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Run k-medoids
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ }
+ DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ DBIDs ids = relation.getDBIDs();
+ // Choose initial medoids
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+
+ WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ // Initial assignment to nearest medoids
+ // TODO: reuse this information, from the build phase, when possible?
+ assignToNearestCluster(medoids, ids, second, clusters, distQ);
+
+ // Swap phase
+ boolean changed = true;
+ while(changed) {
+ changed = false;
+ // Try to swap the medoid with a better cluster member:
+ double best = 0;
+ DBID bestid = null;
+ int bestcluster = -1;
+ for(int i = 0; i < k; i++) {
+ DBID med = medoids.get(i);
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(med.sameDBID(iter)) {
+ continue;
+ }
+ // double disti = distQ.distance(id, med).doubleValue();
+ double cost = 0;
+ for(int j = 0; j < k; j++) {
+ for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
+ double distcur = distQ.distance(iter2, medoids.get(j)).doubleValue();
+ double distnew = distQ.distance(iter2, iter).doubleValue();
+ if(j == i) {
+ // Cases 1 and 2.
+ double distsec = second.doubleValue(iter2);
+                if(distnew > distsec) {
+ // Case 1, other would switch to a third medoid
+ cost += distsec - distcur; // Always positive!
+ }
+ else { // Would remain with the candidate
+ cost += distnew - distcur; // Could be negative
+ }
+ }
+ else {
+ // Cases 3-4: objects from other clusters
+ if (distcur < distnew) {
+ // Case 3: no change
+ } else {
+ // Case 4: would switch to new medoid
+ cost += distnew - distcur; // Always negative
+ }
+ }
+ }
+ }
+ if (cost < best) {
+ best = cost;
+ bestid = iter.getDBID();
+ bestcluster = i;
+ }
+ }
+ }
+ if(logger.isDebugging()) {
+ logger.debug("Best cost: " + best);
+ }
+ if(bestid != null) {
+ changed = true;
+ medoids.set(bestcluster, bestid);
+ }
+ // Reassign
+ if(changed) {
+ // TODO: can we save some of these recomputations?
+ assignToNearestCluster(medoids, ids, second, clusters, distQ);
+ }
+ }
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MedoidModel model = new MedoidModel(medoids.get(i));
+ result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+   * Assigns each object to its nearest medoid; the k<sup>th</sup> cluster collects the ids
+   * of objects nearest to the k<sup>th</sup> medoid, and the second-nearest distance is stored.
+   *
+   * @param means Object medoids
+ * @param ids Object ids
+ * @param second Distance to second nearest medoid
+ * @param clusters cluster assignment
+ * @param distQ distance query
+ * @return true when any object was reassigned
+ */
+ protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ boolean changed = false;
+
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ int minIndex = 0;
+ double mindist = Double.POSITIVE_INFINITY;
+ double mindist2 = Double.POSITIVE_INFINITY;
+ for(int i = 0; i < k; i++) {
+ double dist = distQ.distance(iditer, means.get(i)).doubleValue();
+ if(dist < mindist) {
+ minIndex = i;
+ mindist2 = mindist;
+ mindist = dist;
+ }
+ else if(dist < mindist2) {
+ mindist2 = dist;
+ }
+ }
+ if(clusters.get(minIndex).add(iditer)) {
+ changed = true;
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
+ break;
+ }
+ }
+ }
+ }
+ second.put(iditer, mindist2);
+ }
+ return changed;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMedoidsInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected KMedoidsPAM<V, D> makeInstance() {
+ return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
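
The swap phase above evaluates, for every pair of a current medoid and a cluster member, how the total deviation would change if the two were exchanged, distinguishing four cases per object. The following standalone sketch is not part of the patch; all names (SwapCostExample, nearest, second, toCandidate, inClusterI) are hypothetical, and it works on plain distance arrays rather than the ELKI DistanceQuery API. It illustrates the same case analysis, assuming the distance of each object to its nearest and second-nearest medoid is already known:

// Standalone illustration of the PAM swap cost (cases 1-4); hypothetical names throughout.
public class SwapCostExample {
  /**
   * Cost change of replacing medoid i by a candidate, accumulated over all objects.
   *
   * @param nearest     distance of each object to its current (nearest) medoid
   * @param second      distance of each object to its second-nearest medoid
   * @param toCandidate distance of each object to the swap candidate
   * @param inClusterI  whether the object is currently assigned to medoid i
   * @return total change in deviation; negative values mean the swap improves the clustering
   */
  static double swapCost(double[] nearest, double[] second, double[] toCandidate, boolean[] inClusterI) {
    double cost = 0;
    for(int o = 0; o < nearest.length; o++) {
      if(inClusterI[o]) {
        // Cases 1 and 2: the object loses its current medoid.
        if(toCandidate[o] > second[o]) {
          cost += second[o] - nearest[o]; // Case 1: moves to its second-nearest medoid (>= 0)
        }
        else {
          cost += toCandidate[o] - nearest[o]; // Case 2: stays with the candidate (may be < 0)
        }
      }
      else {
        // Cases 3 and 4: objects of other clusters.
        if(toCandidate[o] < nearest[o]) {
          cost += toCandidate[o] - nearest[o]; // Case 4: switches to the candidate (< 0)
        }
        // Case 3: remains with its medoid, no cost change.
      }
    }
    return cost;
  }
}

In the patch, only the swap with the most negative total cost is applied per iteration (cost < best, with best initialized to 0), and the cluster assignment is then recomputed.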
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
new file mode 100644
index 00000000..094c37bb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
@@ -0,0 +1,187 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * PAM initialization for k-means (and of course, PAM).
+ *
+ * Reference:
+ * <p>
+ * Clustering by means of Medoids<br />
+ * Kaufman, L. and Rousseeuw, P.J.<br />
+ * in: Statistical Data Analysis Based on the L_1-Norm and Related Methods
+ * </p>
+ *
+ * TODO: enforce using a distance matrix?
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
+public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMeansInitialization<V>, KMedoidsInitialization<V> {
+ /**
+ * Constructor.
+ */
+ public PAMInitialMeans() {
+ super();
+ }
+
+ @Override
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ // Get a distance query
+ if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("PAM initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
+ final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
+ DBIDs medids = chooseInitialMedoids(k, distQ);
+ List<V> medoids = new ArrayList<V>(k);
+ for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) {
+ medoids.add(relation.get(iter));
+ }
+ return medoids;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
+ if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("PAM initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
+ final DBIDs ids = distQ.getRelation().getDBIDs();
+
+ ArrayModifiableDBIDs medids = DBIDUtil.newArray(k);
+ double best = Double.POSITIVE_INFINITY;
+ Mean mean = new Mean(); // Mean is numerically more stable than sum.
+ WritableDoubleDataStore mindist = null;
+
+    // The first medoid is the object with the smallest mean distance to all others.
+ {
+ DBID bestid = null;
+ WritableDoubleDataStore bestd = null;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ mean.reset();
+ for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ double d = distQ.distance(iter, iter2).doubleValue();
+ mean.put(d);
+ newd.putDouble(iter2, d);
+ }
+ if(mean.getMean() < best) {
+ best = mean.getMean();
+ bestid = iter.getDBID();
+ if(bestd != null) {
+ bestd.destroy();
+ }
+ bestd = newd;
+ }
+ else {
+ newd.destroy();
+ }
+ }
+ medids.add(bestid);
+ mindist = bestd;
+ }
+ assert (mindist != null);
+
+    // Each subsequent medoid is chosen to minimize the mean distance to the nearest medoid.
+ for(int i = 1; i < k; i++) {
+ DBID bestid = null;
+ WritableDoubleDataStore bestd = null;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ DBID id = iter.getDBID();
+ if(medids.contains(id)) {
+ continue;
+ }
+ WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ mean.reset();
+ for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ DBID other = iter2.getDBID();
+ double dn = distQ.distance(id, other).doubleValue();
+ double v = Math.min(dn, mindist.doubleValue(other));
+ mean.put(v);
+ newd.put(other, v);
+ }
+ assert (mean.getCount() == ids.size());
+ if(mean.getMean() < best) {
+ best = mean.getMean();
+ bestid = id;
+ if(bestd != null) {
+ bestd.destroy();
+ }
+ bestd = newd;
+ }
+ else {
+ newd.destroy();
+ }
+ }
+ if(bestid == null) {
+        throw new AbortException("No medoid found that improves the criterion function.");
+ }
+ medids.add(bestid);
+ mindist.destroy();
+ mindist = bestd;
+ }
+
+ mindist.destroy();
+ return medids;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ @Override
+ protected PAMInitialMeans<V, D> makeInstance() {
+ return new PAMInitialMeans<V, D>();
+ }
+ }
+} \ No newline at end of file
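
chooseInitialMedoids above implements the greedy BUILD step of PAM: the first medoid is the object with the smallest mean distance to all others, and each further medoid is the object that most reduces the mean distance to the nearest already-chosen medoid. The following compact sketch is not part of the patch; names such as PamBuildExample and dist are hypothetical, and it uses a precomputed distance matrix and integer indices where the ELKI class uses DBIDs and a DistanceQuery:

// Standalone sketch of the greedy PAM BUILD step on a distance matrix; hypothetical names.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class PamBuildExample {
  /** Greedily pick k medoid indices minimizing the mean distance to the nearest medoid. */
  static List<Integer> build(double[][] dist, int k) {
    final int n = dist.length;
    List<Integer> medoids = new ArrayList<Integer>(k);
    double[] mindist = new double[n]; // distance of each object to its nearest chosen medoid
    Arrays.fill(mindist, Double.POSITIVE_INFINITY); // so the first round reduces to the plain distance sum
    for(int i = 0; i < k; i++) {
      int bestid = -1;
      double best = Double.POSITIVE_INFINITY;
      for(int cand = 0; cand < n; cand++) {
        if(medoids.contains(cand)) {
          continue; // already a medoid
        }
        double sum = 0;
        for(int o = 0; o < n; o++) {
          sum += Math.min(dist[cand][o], mindist[o]); // objects keep their closer existing medoid
        }
        if(sum / n < best) {
          best = sum / n;
          bestid = cand;
        }
      }
      medoids.add(bestid);
      for(int o = 0; o < n; o++) {
        mindist[o] = Math.min(mindist[o], dist[bestid][o]); // update nearest-medoid distances
      }
    }
    return medoids;
  }
}

Unlike this sketch, the ELKI version keeps the per-object nearest distances in a WritableDoubleDataStore and aborts if no candidate improves the criterion.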
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
index 30e59453..5b9da923 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
@@ -25,13 +25,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
/**
  * Initialize K-means by randomly choosing k existing elements as cluster
@@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
*
* @param <V> Vector type
*/
-public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
/**
* Constructor.
*
@@ -52,15 +51,20 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed);
- List<Vector> means = new ArrayList<Vector>(k);
- for(DBID id : ids) {
- means.add(relation.get(id).getColumnVector());
+ List<V> means = new ArrayList<V>(k);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ means.add(relation.get(iter));
}
return means;
}
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) {
+ return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, seed);
+ }
+
/**
* Parameterization class.
*
@@ -68,7 +72,7 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> {
@Override
protected RandomlyChosenInitialMeans<V> makeInstance() {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
index e8a466dd..00ed08c4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -30,7 +30,6 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.math.MathUtil;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@@ -53,10 +52,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
final int dim = DatabaseUtil.dimensionality(relation);
Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
- List<Vector> means = new ArrayList<Vector>(k);
+ List<V> means = new ArrayList<V>(k);
final Random random = (this.seed != null) ? new Random(this.seed) : new Random();
for(int i = 0; i < k; i++) {
double[] r = MathUtil.randomDoubleArray(dim, random);
@@ -64,12 +63,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends
for(int d = 0; d < dim; d++) {
r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
}
- means.add(new Vector(r));
+ means.add(minmax.first.newNumberVector(r));
}
return means;
}
-
/**
* Parameterization class.
*
@@ -78,7 +76,6 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends
* @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
-
@Override
protected RandomlyGeneratedInitialMeans<V> makeInstance() {
return new RandomlyGeneratedInitialMeans<V>(seed);