diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
15 files changed, 1244 insertions, 146 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index d3c73b53..92862909 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -1,24 +1,5 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -42,37 +23,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; + /** * Abstract base class for k-means implementations. * * @author Erich Schubert * + * @apiviz.composedOf KMeansInitialization + * * @param <V> Vector type * @param <D> Distance type */ -public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> { - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater or equal to 0, where 0 means no limit. 
- */ - public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); - - /** - * Parameter to specify the random generator seed. - */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); - - /** - * Parameter to specify the initialization method - */ - public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); - +public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> implements KMeans { /** * Holds the value of {@link #K_ID}. */ @@ -94,6 +77,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means */ public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { super(distanceFunction); @@ -111,15 +95,15 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param clusters cluster assignment * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) { + protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?, ?>> means, List<? extends ModifiableDBIDs> clusters) { boolean changed = false; if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?, ?>>) getDistanceFunction(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); @@ -128,13 +112,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis mindist = dist; } } - if(clusters.get(minIndex).add(id)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? for(int i = 0; i < k; i++) { if(i != minIndex) { - if(clusters.get(i).remove(id)) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -144,9 +128,9 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } else { final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); @@ -155,13 +139,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis mindist = dist; } } - if(clusters.get(minIndex).add(id)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? 
for(int i = 0; i < k; i++) { if(i != minIndex) { - if(clusters.get(i).remove(id)) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -185,25 +169,23 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) { + protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> means, Relation<V> database) { List<Vector> newMeans = new ArrayList<Vector>(k); for(int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) { - if(mean == null) { - mean = database.get(clusterIter.next()).getColumnVector(); - } - else { - mean.plusEquals(database.get(clusterIter.next()).getColumnVector()); - } - } if(list.size() > 0) { - assert mean != null; - mean.timesEquals(1.0 / list.size()); + double s = 1.0 / list.size(); + DBIDIter iter = list.iter(); + assert (iter.valid()); + mean = database.get(iter).getColumnVector().timesEquals(s); + iter.advance(); + for(; iter.valid(); iter.advance()) { + mean.plusTimesEquals(database.get(iter).getColumnVector(), s); + } } else { - mean = means.get(i); + mean = means.get(i).getColumnVector(); } newMeans.add(mean); } @@ -211,6 +193,36 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } /** + * Returns the median vectors of the given clusters in the given database. + * + * @param clusters the clusters to compute the means + * @param medians the recent medians + * @param database the database containing the vectors + * @return the mean vectors of the given clusters in the given database + */ + protected List<NumberVector<?, ?>> medians(List<? extends ModifiableDBIDs> clusters, List<? 
extends NumberVector<?, ?>> medians, Relation<V> database) { + final int dim = medians.get(0).getDimensionality(); + final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); + List<NumberVector<?, ?>> newMedians = new ArrayList<NumberVector<?, ?>>(k); + for(int i = 0; i < k; i++) { + ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); + if(list.size() > 0) { + Vector mean = new Vector(dim); + for(int d = 0; d < dim; d++) { + sorter.setDimension(d + 1); + DBID id = QuickSelect.median(list, sorter); + mean.set(d, database.get(id).doubleValue(d + 1)); + } + newMedians.add(mean); + } + else { + newMedians.add((NumberVector<?, ?>) medians.get(i)); + } + } + return newMedians; + } + + /** * Compute an incremental update for the mean * * @param mean Mean to update @@ -239,16 +251,16 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis */ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { boolean changed = false; - + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?, ?>>) getDistanceFunction(); - + // Incremental update - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); @@ -261,13 +273,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis for(int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); if(i == minIndex) { - if(ci.add(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + if(ci.add(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } } - else if(ci.remove(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + else if(ci.remove(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } @@ -276,11 +288,11 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis else { // Raw distance function final PrimitiveDistanceFunction<? 
super NumberVector<?, ?>, D> df = getDistanceFunction(); - + // Incremental update - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); @@ -293,13 +305,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis for(int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); if(i == minIndex) { - if(ci.add(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + if(ci.add(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } } - else if(ci.remove(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + else if(ci.remove(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java index b5f088fb..a8effecd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -22,7 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; @@ -34,9 +33,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; * * @param <V> Vector type */ -public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> { +public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> { /** - * Holds the value of {@link KMeansLloyd#SEED_ID}. + * Holds the value of {@link KMeans#SEED_ID}. */ protected Long seed; @@ -56,13 +55,13 @@ public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> * * @apiviz.exclude */ - public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public abstract static class Parameterizer<V> extends AbstractParameterizer { protected Long seed; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true); + LongParameter seedP = new LongParameter(KMeans.SEED_ID, true); if(config.grab(seedP)) { seed = seedP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java index 78ccd426..7a7f2867 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -23,14 +23,16 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** @@ -40,20 +42,30 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; * * @param <V> Vector type */ -public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { +public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsInitialization<V> { /** * Constructor. */ public FirstKInitialMeans() { - super(null); + super(); } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { - Iterator<DBID> iter = relation.iterDBIDs(); - List<Vector> means = new ArrayList<Vector>(k); - for(int i = 0; i < k && iter.hasNext(); i++) { - means.add(relation.get(iter.next()).getColumnVector()); + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + DBIDIter iter = relation.iterDBIDs(); + List<V> means = new ArrayList<V>(k); + for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { + means.add(relation.get(iter)); + } + return means; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? 
super V, ?> distanceFunction) { + DBIDIter iter = distanceFunction.getRelation().iterDBIDs(); + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { + means.add(iter); } return means; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java new file mode 100644 index 00000000..37171d4a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -0,0 +1,55 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Some constants and options shared among kmeans family algorithms. 
+ * + * @author Erich Schubert + */ +public interface KMeans { + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); + + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater or equal to 0, where 0 means no limit. + */ + public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); + + /** + * Parameter to specify the random generator seed. + */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java index f4c0d9c7..9e5d69f0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -24,19 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.List; -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** * Interface for initializing K-Means * * @author Erich Schubert * - * @param <V> Vector type + * @param <V> Object type */ -public interface KMeansInitialization<V extends NumberVector<V, ?>> { +public interface KMeansInitialization<V> { /** * Choose initial means * @@ -45,5 +43,5 @@ public interface KMeansInitialization<V extends NumberVector<V, ?>> { * @param distanceFunction Distance function * @return List of chosen means for k-means */ - public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); + public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index fda1d6c0..b1b40632 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -39,7 +39,6 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -94,14 +93,13 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex * @param database Database * @param relation relation to use * @return result - * @throws IllegalStateException */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { if(relation.size() <= 0) { return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? 
extends NumberVector<?, ?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); for(int i = 0; i < k; i++) { @@ -124,7 +122,7 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); for(int i = 0; i < clusters.size(); i++) { - MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); } return result; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index 56492dd0..c729eb10 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -93,15 +93,17 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> * * @param database Database * @param relation relation to use - * @return result - * @throws IllegalStateException + * @return Clustering result */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { if(relation.size() <= 0) { return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<Vector> means = new ArrayList<Vector>(k); + for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, 
getDistanceFunction())) { + means.add(nv.getColumnVector()); + } // Initialize cluster and assign objects List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); for(int i = 0; i < k; i++) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index c7a2fa1d..9afeff6c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -26,19 +26,18 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -59,7 +58,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; * @param <D> Distance type */ @Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. 
of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494") -public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> { +public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { /** * Constructor. * @@ -70,7 +69,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("K-Means++ initialization can only be used with numerical distances."); @@ -80,14 +79,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); // Chose first mean - List<Vector> means = new ArrayList<Vector>(k); + List<V> means = new ArrayList<V>(k); Random random = (seed != null) ? 
new Random(seed) : new Random(); - DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next(); - means.add(relation.get(first).getColumnVector()); + DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iter().getDBID(); + means.add(relation.get(first)); - ModifiableDBIDs chosen = DBIDUtil.newHashSet(k); - chosen.add(first); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); // Initialize weights double[] weights = new double[ids.size()]; @@ -107,16 +104,16 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends } // Add new mean: DBID newmean = ids.get(pos); - means.add(relation.get(newmean).getColumnVector()); - chosen.add(newmean); + means.add(relation.get(newmean)); // Update weights: weights[pos] = 0.0; // Choose optimized version for double distances, if applicable. - if (distF instanceof PrimitiveDoubleDistanceFunction) { + if(distF instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF; weightsum = updateWeights(weights, ids, newmean, ddist, relation); - } else { + } + else { weightsum = updateWeights(weights, ids, newmean, distQ); } } @@ -124,6 +121,48 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends return means; } + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; + // Choose first mean + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + + Random random = (seed != null) ?
new Random(seed) : new Random(); + DBID first = DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random.nextLong()).iter().getDBID(); + means.add(first); + + ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); + // Initialize weights + double[] weights = new double[ids.size()]; + double weightsum = initialWeights(weights, ids, first, distQ); + while(means.size() < k) { + if(weightsum > Double.MAX_VALUE) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?"); + } + if(weightsum < Double.MIN_NORMAL) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); + } + double r = random.nextDouble() * weightsum; + int pos = 0; + while(r > 0 && pos < weights.length) { + r -= weights[pos]; + pos++; + } + // Add new mean: + DBID newmean = ids.get(pos); + means.add(newmean); + // Update weights: + weights[pos] = 0.0; + weightsum = updateWeights(weights, ids, newmean, distQ); + } + + return means; + } + /** * Initialize the weight list. * @@ -133,16 +172,15 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends * @param distQ Distance query * @return Weight sum */ - protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? 
super V, D> distQ) { double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); - if(latest.equals(id)) { + if(latest.sameDBID(it)) { weights[i] = 0.0; } else { - double d = distQ.distance(latest, id).doubleValue(); + double d = distQ.distance(latest, it).doubleValue(); weights[i] = d * d; } weightsum += weights[i]; @@ -159,13 +197,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends * @param distQ Distance query * @return Weight sum */ - protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) { double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); if(weights[i] > 0.0) { - double d = distQ.distance(latest, id).doubleValue(); + double d = distQ.distance(latest, it).doubleValue(); weights[i] = Math.min(weights[i], d * d); weightsum += weights[i]; } @@ -187,9 +224,8 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); if(weights[i] > 0.0) { - double d = distF.doubleDistance(lv, rel.get(id)); + double d = distF.doubleDistance(lv, rel.get(it)); weights[i] = Math.min(weights[i], d * d); weightsum += weights[i]; } @@ -204,7 +240,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected KMeansPlusPlusInitialMeans<V, D> 
makeInstance() { return new KMeansPlusPlusInitialMeans<V, D>(seed); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java new file mode 100644 index 00000000..8c284981 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -0,0 +1,172 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medians clustering algorithm, using Lloyd-style bulk + * iterations. + * + * Reference: + * <p> + * Clustering via Concave Minimization<br /> + * P. S. Bradley, O. L. Mangasarian, W. N. Street<br /> + * in: Advances in neural information processing systems + * </p> + * + * @author Erich Schubert + * + * @apiviz.has MeanModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +@Title("K-Medians") +@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. 
Street", booktitle = "Advances in neural information processing systems", url="http://nips.djvuzone.org/djvu/nips09/0368.djvu") +public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMediansLloyd.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-medians + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + } + // Choose initial medians + List<? extends NumberVector<?, ?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if(logger.isVerbose()) { + logger.verbose("K-Medians iteration " + (iteration + 1)); + } + boolean changed = assignToNearestCluster(relation, medians, clusters); + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute medians. 
+ medians = medians(clusters, medians, relation); + } + // Wrap result + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); + result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans<V, D> makeInstance() { + return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java new file mode 100644 index 00000000..a5c3d675 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -0,0 +1,271 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medoids clustering algorithm, using a "bulk" variation of the + * "Partitioning Around Medoids" approach. + * + * In contrast to PAM, which will in each iteration update one medoid with one + * (arbitrary) non-medoid, this implementation follows the EM pattern. 
In the + * expectation step, the best medoid from the cluster members is chosen; in the + * M-step, the objects are reassigned to their nearest medoid. + * + * We do not have a reference for this algorithm. It borrows ideas from EM and + * PAM. If needed, you are welcome to cite it using the latest ELKI publication + * (this variation is likely not worth publishing on its own). + * + * @author Erich Schubert + * + * @apiviz.has MedoidModel + * @apiviz.composedOf KMedoidsInitialization + * + * @param <V> vector datatype + * @param <D> distance value type + */ +public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMedoidsEM.class); + + /** + * Holds the value of {@link AbstractKMeans#K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMedoidsInitialization<V> initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means + */ + public KMedoidsEM(PrimitiveDistanceFunction<?
super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Run k-medoids + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering<MedoidModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + } + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + Mean[] mdists = Mean.newArray(k); + + // Initial assignment to nearest medoids + // TODO: reuse this information, from the build phase, when possible? 
+ assignToNearestCluster(medoids, mdists, clusters, distQ); + + // Swap phase + boolean changed = true; + while(changed) { + changed = false; + // Try to swap the medoid with a better cluster member: + for(int i = 0; i < k; i++) { + DBID med = medoids.get(i); + DBID best = null; + Mean bestm = mdists[i]; + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(med.sameDBID(iter)) { + continue; + } + Mean mdist = new Mean(); + for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { + mdist.put(distQ.distance(iter, iter2).doubleValue()); + } + if(mdist.getMean() < bestm.getMean()) { + best = iter.getDBID(); + bestm = mdist; + } + } + if(best != null && !med.sameDBID(best)) { + changed = true; + medoids.set(i, best); + mdists[i] = bestm; + } + } + // Reassign + if(changed) { + assignToNearestCluster(medoids, mdists, clusters, distQ); + } + } + + // Wrap result + Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param means a list of k means + * @param mdist Mean distances + * @param clusters cluster assignment + * @param distQ distance query + * @return true when the object was reassigned + */ + protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? 
extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + boolean changed = false; + + double[] dists = new double[k]; + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + int minIndex = 0; + double mindist = Double.POSITIVE_INFINITY; + for(int i = 0; i < k; i++) { + dists[i] = distQ.distance(iditer, means.get(i)).doubleValue(); + if(dists[i] < mindist) { + minIndex = i; + mindist = dists[i]; + } + } + if(clusters.get(minIndex).add(iditer)) { + changed = true; + mdist[minIndex].put(mindist); + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { + mdist[minIndex].put(dists[i], -1); + break; + } + } + } + } + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + protected int k; + + protected int maxiter; + + protected KMedoidsInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected KMedoidsEM<V, D> makeInstance() { + return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java new file mode 100644 index 00000000..269e7e9e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java @@ -0,0 +1,45 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; + +/** + * Interface for initializing K-Medoids. In contrast to k-means initializers, + * this initialization will only return members of the original data set. + * + * @author Erich Schubert + * + * @param <V> Object type + */ +public interface KMedoidsInitialization<V> { + /** + * Choose initial means + * + * @param k Parameter k + * @param distanceFunction Distance function + * @return List of chosen means for k-means + */ + public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java new file mode 100644 index 00000000..30c80084 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -0,0 +1,310 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + 
+/** + * Provides the k-medoids clustering algorithm, using the + * "Partitioning Around Medoids" approach. + * + * Reference: + * <p> + * Clustering by means of Medoids<br /> + * Kaufman, L. and Rousseeuw, P.J.<br /> + * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + * </p> + * + * @author Erich Schubert + * + * @apiviz.has MedoidModel + * @apiviz.composedOf KMedoidsInitialization + * + * @param <V> vector datatype + * @param <D> distance value type + */ +@Title("Partitioning Around Medoids") +@Reference(title = "Clustering by means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMedoidsPAM.class); + + /** + * Holds the value of {@link AbstractKMeans#K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMedoidsInitialization<V> initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means + */ + public KMedoidsPAM(PrimitiveDistanceFunction<?
super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Run k-medoids + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering<MedoidModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + } + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + DBIDs ids = relation.getDBIDs(); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + // Initial assignment to nearest medoids + // TODO: reuse this information, from the build phase, when possible? 
+ assignToNearestCluster(medoids, ids, second, clusters, distQ); + + // Swap phase + boolean changed = true; + while(changed) { + changed = false; + // Try to swap the medoid with a better cluster member: + double best = 0; + DBID bestid = null; + int bestcluster = -1; + for(int i = 0; i < k; i++) { + DBID med = medoids.get(i); + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(med.sameDBID(iter)) { + continue; + } + // double disti = distQ.distance(id, med).doubleValue(); + double cost = 0; + for(int j = 0; j < k; j++) { + for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { + double distcur = distQ.distance(iter2, medoids.get(j)).doubleValue(); + double distnew = distQ.distance(iter2, iter).doubleValue(); + if(j == i) { + // Cases 1 and 2. + double distsec = second.doubleValue(iter2); + if(distcur > distsec) { + // Case 1, other would switch to a third medoid + cost += distsec - distcur; // Always positive! + } + else { // Would remain with the candidate + cost += distnew - distcur; // Could be negative + } + } + else { + // Cases 3-4: objects from other clusters + if (distcur < distnew) { + // Case 3: no change + } else { + // Case 4: would switch to new medoid + cost += distnew - distcur; // Always negative + } + } + } + } + if (cost < best) { + best = cost; + bestid = iter.getDBID(); + bestcluster = i; + } + } + } + if(logger.isDebugging()) { + logger.debug("Best cost: " + best); + } + if(bestid != null) { + changed = true; + medoids.set(bestcluster, bestid); + } + // Reassign + if(changed) { + // TODO: can we save some of these recomputations? 
+ assignToNearestCluster(medoids, ids, second, clusters, distQ); + } + } + + // Wrap result + Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param means Object centroids + * @param ids Object ids + * @param second Distance to second nearest medoid + * @param clusters cluster assignment + * @param distQ distance query + * @return true when any object was reassigned + */ + protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + boolean changed = false; + + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + int minIndex = 0; + double mindist = Double.POSITIVE_INFINITY; + double mindist2 = Double.POSITIVE_INFINITY; + for(int i = 0; i < k; i++) { + double dist = distQ.distance(iditer, means.get(i)).doubleValue(); + if(dist < mindist) { + minIndex = i; + mindist2 = mindist; + mindist = dist; + } + else if(dist < mindist2) { + mindist2 = dist; + } + } + if(clusters.get(minIndex).add(iditer)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? 
+ for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { + break; + } + } + } + } + second.put(iditer, mindist2); + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + protected int k; + + protected int maxiter; + + protected KMedoidsInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected KMedoidsPAM<V, D> makeInstance() { + return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java new file mode 100644 index 00000000..094c37bb --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java @@ -0,0 +1,187 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * PAM initialization for k-means (and of course, PAM). + * + * Reference: + * <p> + * Clustering by means of Medoids<br /> + * Kaufman, L. and Rousseeuw, P.J.<br /> + * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + * </p> + * + * TODO: enforce using a distance matrix? + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +@Reference(title = "Clustering by means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMeansInitialization<V>, KMedoidsInitialization<V> { + /** + * Constructor. + */ + public PAMInitialMeans() { + super(); + } + + @Override + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<?
super V, ?> distanceFunction) { + // Get a distance query + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; + final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + DBIDs medids = chooseInitialMedoids(k, distQ); + List<V> medoids = new ArrayList<V>(k); + for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) { + medoids.add(relation.get(iter)); + } + return medoids; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; + final DBIDs ids = distQ.getRelation().getDBIDs(); + + ArrayModifiableDBIDs medids = DBIDUtil.newArray(k); + double best = Double.POSITIVE_INFINITY; + Mean mean = new Mean(); // Mean is numerically more stable than sum. + WritableDoubleDataStore mindist = null; + + // First mean is chosen by having the smallest distance sum to all others. 
+ { + DBID bestid = null; + WritableDoubleDataStore bestd = null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + mean.reset(); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + double d = distQ.distance(iter, iter2).doubleValue(); + mean.put(d); + newd.putDouble(iter2, d); + } + if(mean.getMean() < best) { + best = mean.getMean(); + bestid = iter.getDBID(); + if(bestd != null) { + bestd.destroy(); + } + bestd = newd; + } + else { + newd.destroy(); + } + } + medids.add(bestid); + mindist = bestd; + } + assert (mindist != null); + + // Subsequent means optimize the full criterion. + for(int i = 1; i < k; i++) { + DBID bestid = null; + WritableDoubleDataStore bestd = null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); + if(medids.contains(id)) { + continue; + } + WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + mean.reset(); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + DBID other = iter2.getDBID(); + double dn = distQ.distance(id, other).doubleValue(); + double v = Math.min(dn, mindist.doubleValue(other)); + mean.put(v); + newd.put(other, v); + } + assert (mean.getCount() == ids.size()); + if(mean.getMean() < best) { + best = mean.getMean(); + bestid = id; + if(bestd != null) { + bestd.destroy(); + } + bestd = newd; + } + else { + newd.destroy(); + } + } + if(bestid == null) { + throw new AbortException("No median found that improves the criterion function?!?"); + } + medids.add(bestid); + mindist.destroy(); + mindist = bestd; + } + + mindist.destroy(); + return medids; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + @Override + protected PAMInitialMeans<V, D> makeInstance() { + return new PAMInitialMeans<V, D>(); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java index 30e59453..5b9da923 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -25,13 +25,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** * Initialize K-means by randomly choosing k exsiting elements as cluster @@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; * * @param <V> Vector type */ -public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { +public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { /** * Constructor. * @@ -52,15 +51,20 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? 
super V, ?> distanceFunction) { DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed); - List<Vector> means = new ArrayList<Vector>(k); - for(DBID id : ids) { - means.add(relation.get(id).getColumnVector()); + List<V> means = new ArrayList<V>(k); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + means.add(relation.get(iter)); } return means; } + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) { + return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, seed); + } + /** * Parameterization class. * @@ -68,7 +72,7 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected RandomlyChosenInitialMeans<V> makeInstance() { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index e8a466dd..00ed08c4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -30,7 +30,6 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.math.MathUtil; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -53,10 +52,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, 
PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { final int dim = DatabaseUtil.dimensionality(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<Vector> means = new ArrayList<Vector>(k); + List<V> means = new ArrayList<V>(k); final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); @@ -64,12 +63,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends for(int d = 0; d < dim; d++) { r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; } - means.add(new Vector(r)); + means.add(minmax.first.newNumberVector(r)); } return means; } - /** * Parameterization class. * @@ -78,7 +76,6 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { - @Override protected RandomlyGeneratedInitialMeans<V> makeInstance() { return new RandomlyGeneratedInitialMeans<V>(seed); |