diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
14 files changed, 309 insertions, 239 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index 92862909..47855aad 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -27,10 +27,12 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension; import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; @@ -50,12 +52,14 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; * * @author Erich Schubert * + * @apiviz.has MeanModel * @apiviz.composedOf KMeansInitialization * * @param <V> Vector type * @param <D> Distance type + * @param <M> Cluster model type */ -public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> implements KMeans { +public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans, ClusteringAlgorithm<Clustering<M>> { /** * Holds the value of {@link #K_ID}. */ @@ -79,7 +83,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param maxiter Maxiter parameter * @param initializer Function to generate the initial means */ - public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public AbstractKMeans(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { super(distanceFunction); this.k = k; this.maxiter = maxiter; @@ -95,12 +99,12 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param clusters cluster assignment * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?, ?>> means, List<? extends ModifiableDBIDs> clusters) { + protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) { boolean changed = false; if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") - final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction(); + final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); @@ -127,7 +131,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } } else { - final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); + final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); @@ -158,7 +162,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis @Override public TypeInformation[] getInputTypeRestriction() { - return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + return TypeUtil.array(new CombinedTypeInformation(TypeUtil.NUMBER_VECTOR_FIELD, getDistanceFunction().getInputTypeRestriction())); } /** @@ -169,7 +173,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> means, Relation<V> database) { + protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) { List<Vector> newMeans = new ArrayList<Vector>(k); for(int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); @@ -200,30 +204,30 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List<NumberVector<?, ?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> medians, Relation<V> database) { + protected List<NumberVector<?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> medians, Relation<V> database) { final int dim = medians.get(0).getDimensionality(); final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); - List<NumberVector<?, ?>> newMedians = new ArrayList<NumberVector<?, ?>>(k); + List<NumberVector<?>> newMedians = new ArrayList<NumberVector<?>>(k); for(int i = 0; i < k; i++) { ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); if(list.size() > 0) { Vector mean = new Vector(dim); for(int d = 0; d < dim; d++) { - sorter.setDimension(d + 1); + sorter.setDimension(d); DBID id = QuickSelect.median(list, sorter); - mean.set(d, database.get(id).doubleValue(d + 1)); + mean.set(d, database.get(id).doubleValue(d)); } newMedians.add(mean); } else { - newMedians.add((NumberVector<?, ?>) medians.get(i)); + newMedians.add((NumberVector<?>) medians.get(i)); } } return newMedians; } /** - * Compute an incremental update for the mean + * Compute an incremental update for the mean. * * @param mean Mean to update * @param vec Object vector @@ -255,7 +259,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") - final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction(); + final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); // Incremental update for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { @@ -287,7 +291,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } else { // Raw distance function - final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); + final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); // Incremental update for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { @@ -319,4 +323,4 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } return changed; } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java index a8effecd..3a69c806 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -22,9 +22,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; /** * Abstract base class for common k-means initializations. @@ -35,17 +36,17 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; */ public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> { /** - * Holds the value of {@link KMeans#SEED_ID}. + * Random number generator */ - protected Long seed; + protected RandomFactory rnd; /** * Constructor. * - * @param seed Random seed. + * @param rnd Random number generator. */ - public AbstractKMeansInitialization(Long seed) { - this.seed = seed; + public AbstractKMeansInitialization(RandomFactory rnd) { + this.rnd = rnd; } /** @@ -56,14 +57,17 @@ public abstract class AbstractKMeansInitialization<V> implements KMeansInitializ * @apiviz.exclude */ public abstract static class Parameterizer<V> extends AbstractParameterizer { - protected Long seed; + /** + * Random generator + */ + protected RandomFactory rnd; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - LongParameter seedP = new LongParameter(KMeans.SEED_ID, true); - if(config.grab(seedP)) { - seed = seedP.getValue(); + RandomParameter rndP = new RandomParameter(KMeans.SEED_ID); + if(config.grab(rndP)) { + rnd = rndP.getValue(); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java index 7a7f2867..1e51f4d6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -77,7 +77,7 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { @Override protected FirstKInitialMeans<V> makeInstance() { return new FirstKInitialMeans<V>(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java index 37171d4a..68fc4e48 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -34,22 +34,22 @@ public interface KMeans { /** * Parameter to specify the initialization method */ - public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + public static final OptionID INIT_ID = new OptionID("kmeans.initialization", "Method to choose the initial means."); /** * Parameter to specify the number of clusters to find, must be an integer * greater than 0. */ - public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); + public static final OptionID K_ID = new OptionID("kmeans.k", "The number of clusters to find."); /** * Parameter to specify the number of clusters to find, must be an integer * greater or equal to 0, where 0 means no limit. */ - public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); + public static final OptionID MAXITER_ID = new OptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); /** * Parameter to specify the random generator seed. */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); + public static final OptionID SEED_ID = new OptionID("kmeans.seed", "The random number generator seed."); }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java index 9e5d69f0..54b3a2ce 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -31,6 +31,8 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; * Interface for initializing K-Means * * @author Erich Schubert + * + * @apiviz.landmark * * @param <V> Object type */ @@ -44,4 +46,4 @@ public interface KMeansInitialization<V> { * @return List of chosen means for k-means */ public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index b1b40632..f43c2277 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -27,19 +27,20 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -62,7 +63,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @author Arthur Zimek * - * @apiviz.has MeanModel + * @apiviz.landmark + * @apiviz.has KMeansModel * * @param <V> vector datatype * @param <D> distance value type @@ -70,11 +72,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @Title("K-Means") @Description("Finds a partitioning into k clusters.") @Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489") -public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { +public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(KMeansLloyd.class); + private static final Logging LOG = Logging.getLogger(KMeansLloyd.class); /** * Constructor. @@ -82,55 +84,56 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter + * @param initializer Initialization method */ - public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { super(distanceFunction, k, maxiter, initializer); } /** - * Run k-means + * Run k-means. * * @param database Database * @param relation relation to use * @return result */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { - if(relation.size() <= 0) { - return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + if (relation.size() <= 0) { + return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<? extends NumberVector<?, ?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } - for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if(logger.isVerbose()) { - logger.verbose("K-Means iteration " + (iteration + 1)); + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if (LOG.isVerbose()) { + LOG.verbose("K-Means iteration " + (iteration + 1)); } boolean changed = assignToNearestCluster(relation, means, clusters); // Stop if no cluster assignment changed. - if(!changed) { + if (!changed) { break; } // Recompute means. means = means(clusters, means, relation); } // Wrap result - final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); - Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); - for(int i = 0; i < clusters.size(); i++) { - MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); - result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + for (int i = 0; i < clusters.size(); i++) { + KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addCluster(new Cluster<KMeansModel<V>>(clusters.get(i), model)); } return result; } @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -140,35 +143,53 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { + /** + * k Parameter. + */ protected int k; + /** + * Number of iterations. + */ protected int maxiter; + /** + * Initialization method. + */ protected KMeansInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { + ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); + if(config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); + } + } + + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { k = kP.getValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if(config.grab(initialP)) { + if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); + IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { + maxiter = maxiterP.intValue(); } } @Override - protected AbstractKMeans<V, D> makeInstance() { + protected KMeansLloyd<V, D> makeInstance() { return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index c729eb10..0cc7c363 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -27,21 +27,22 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -62,8 +63,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * </p> * * @author Erich Schubert - * - * @apiviz.has MeanModel + * @apiviz.has KMeansModel * * @param <V> vector type to use * @param <D> distance function value type @@ -71,11 +71,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @Title("K-Means") @Description("Finds a partitioning into k clusters.") @Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992") -public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { +public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(KMeansMacQueen.class); + private static final Logging LOG = Logging.getLogger(KMeansMacQueen.class); /** * Constructor. @@ -83,30 +83,31 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter + * @param initializer Initialization method */ - public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { super(distanceFunction, k, maxiter, initializer); } /** - * Run k-means + * Run k-means. * * @param database Database * @param relation relation to use * @return Clustering result */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { - if(relation.size() <= 0) { - return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + if (relation.size() <= 0) { + return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means List<Vector> means = new ArrayList<Vector>(k); - for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) { + for (NumberVector<?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) { means.add(nv.getColumnVector()); } // Initialize cluster and assign objects List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } assignToNearestCluster(relation, means, clusters); @@ -114,28 +115,28 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> means = means(clusters, means, relation); // Refine result - for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if(logger.isVerbose()) { - logger.verbose("K-Means iteration " + (iteration + 1)); + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if (LOG.isVerbose()) { + LOG.verbose("K-Means iteration " + (iteration + 1)); } boolean changed = macQueenIterate(relation, means, clusters); - if(!changed) { + if (!changed) { break; } } - final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); - Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); - for(int i = 0; i < clusters.size(); i++) { + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); - MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); - result.addCluster(new Cluster<MeanModel<V>>(ids, model)); + KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addCluster(new Cluster<KMeansModel<V>>(ids, model)); } return result; } @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -145,35 +146,53 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { + /** + * k Parameter. + */ protected int k; + /** + * Maximum number of iterations. + */ protected int maxiter; + /** + * Initialization method. + */ protected KMeansInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { + ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); + if (config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); + } + } + + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { k = kP.getValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if(config.grab(initialP)) { + if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { + IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { maxiter = maxiterP.getValue(); } } @Override - protected AbstractKMeans<V, D> makeInstance() { + protected KMeansMacQueen<V, D> makeInstance() { return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index 9afeff6c..a07953da 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -62,10 +63,10 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten /** * Constructor. * - * @param seed Random seed. + * @param rnd Random generator. */ - public KMeansPlusPlusInitialMeans(Long seed) { - super(seed); + public KMeansPlusPlusInitialMeans(RandomFactory rnd) { + super(rnd); } @Override @@ -81,8 +82,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean List<V> means = new ArrayList<V>(k); - Random random = (seed != null) ? new Random(seed) : new Random(); - DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iter().getDBID(); + Random random = rnd.getRandom(); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter()); means.add(relation.get(first)); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); @@ -131,8 +132,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - Random random = (seed != null) ? new Random(seed) : new Random(); - DBID first = DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random.nextLong()).iter().getDBID(); + Random random = rnd.getRandom(); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, new Random(random.nextLong())).iter()); means.add(first); ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); @@ -176,7 +177,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - if(latest.sameDBID(it)) { + if(DBIDUtil.equal(latest, it)) { weights[i] = 0.0; } else { @@ -243,7 +244,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected KMeansPlusPlusInitialMeans<V, D> makeInstance() { - return new KMeansPlusPlusInitialMeans<V, D>(seed); + return new KMeansPlusPlusInitialMeans<V, D>(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index 8c284981..9917337e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -27,7 +27,6 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -36,10 +35,10 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; @@ -61,18 +60,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @author Erich Schubert * - * @apiviz.has MeanModel - * * @param <V> vector datatype * @param <D> distance value type */ @Title("K-Medians") -@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url="http://nips.djvuzone.org/djvu/nips09/0368.djvu") -public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { +@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url = "http://nips.djvuzone.org/djvu/nips09/0368.djvu") +public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, MeanModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(KMediansLloyd.class); + private static final Logging LOG = Logging.getLogger(KMediansLloyd.class); /** * Constructor. @@ -80,46 +77,47 @@ public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter + * @param initializer Initialization method */ - public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { super(distanceFunction, k, maxiter, initializer); } /** - * Run k-medians + * Run k-medians. * * @param database Database * @param relation relation to use * @return result */ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { - if(relation.size() <= 0) { + if (relation.size() <= 0) { return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); } // Choose initial medians - List<? extends NumberVector<?, ?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } - for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if(logger.isVerbose()) { - logger.verbose("K-Medians iteration " + (iteration + 1)); + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if (LOG.isVerbose()) { + LOG.verbose("K-Medians iteration " + (iteration + 1)); } boolean changed = assignToNearestCluster(relation, medians, clusters); // Stop if no cluster assignment changed. - if(!changed) { + if (!changed) { break; } // Recompute medians. medians = medians(clusters, medians, relation); } // Wrap result - final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); - for(int i = 0; i < clusters.size(); i++) { + for (int i = 0; i < clusters.size(); i++) { MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); } @@ -128,7 +126,7 @@ public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -138,35 +136,46 @@ public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { + /** + * k Parameter. + */ protected int k; + /** + * Maximum number of iterations. + */ protected int maxiter; + /** + * Initialization method. + */ protected KMeansInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { - k = kP.getValue(); + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { + k = kP.intValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if(config.grab(initialP)) { + if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); + IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { + maxiter = maxiterP.intValue(); } } @Override - protected AbstractKMeans<V, D> makeInstance() { + protected KMediansLloyd<V, D> makeInstance() { return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index a5c3d675..f4398458 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -78,7 +78,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(KMedoidsEM.class); + private static final Logging LOG = Logging.getLogger(KMedoidsEM.class); /** * Holds the value of {@link AbstractKMeans#K_ID}. @@ -118,7 +118,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista * @return result */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { - if(relation.size() <= 0) { + if (relation.size() <= 0) { return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); @@ -126,7 +126,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } Mean[] mdists = Mean.newArray(k); @@ -137,41 +137,41 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista // Swap phase boolean changed = true; - while(changed) { + while (changed) { changed = false; // Try to swap the medoid with a better cluster member: - for(int i = 0; i < k; i++) { - DBID med = medoids.get(i); + int i = 0; + for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { DBID best = null; Mean bestm = mdists[i]; - for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { - if(med.sameDBID(iter)) { + for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if (DBIDUtil.equal(miter, iter)) { continue; } Mean mdist = new Mean(); - for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { + for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { mdist.put(distQ.distance(iter, iter2).doubleValue()); } - if(mdist.getMean() < bestm.getMean()) { - best = iter.getDBID(); + if (mdist.getMean() < bestm.getMean()) { + best = DBIDUtil.deref(iter); bestm = mdist; } } - if(best != null && !med.sameDBID(best)) { + if (best != null && !DBIDUtil.equal(miter, best)) { changed = true; medoids.set(i, best); mdists[i] = bestm; } } // Reassign - if(changed) { + if (changed) { assignToNearestCluster(medoids, mdists, clusters, distQ); } } // Wrap result Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); - for(int i = 0; i < clusters.size(); i++) { + for (int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); } @@ -192,24 +192,27 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista boolean changed = false; double[] dists = new double[k]; - for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { int minIndex = 0; double mindist = Double.POSITIVE_INFINITY; - for(int i = 0; i < k; i++) { - dists[i] = distQ.distance(iditer, means.get(i)).doubleValue(); - if(dists[i] < mindist) { - minIndex = i; - mindist = dists[i]; + { + int i = 0; + for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { + dists[i] = distQ.distance(iditer, miter).doubleValue(); + if (dists[i] < mindist) { + minIndex = i; + mindist = dists[i]; + } } } - if(clusters.get(minIndex).add(iditer)) { + if (clusters.get(minIndex).add(iditer)) { changed = true; mdist[minIndex].put(mindist); // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { + for (int i = 0; i < k; i++) { + if (i != minIndex) { + if (clusters.get(i).remove(iditer)) { mdist[minIndex].put(dists[i], -1); break; } @@ -227,7 +230,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -247,19 +250,21 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { - k = kP.getValue(); + IntParameter kP = new IntParameter(KMeans.K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { + k = kP.intValue(); } ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); - if(config.grab(initialP)) { + if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { + maxiter = maxiterP.intValue(); } } @@ -268,4 +273,4 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index 30c80084..906501e4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -83,7 +83,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(KMedoidsPAM.class); + private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class); /** * Holds the value of {@link AbstractKMeans#K_ID}. @@ -123,7 +123,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @return result */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { - if(relation.size() <= 0) { + if (relation.size() <= 0) { return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); @@ -132,7 +132,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -143,36 +143,35 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist // Swap phase boolean changed = true; - while(changed) { + while (changed) { changed = false; // Try to swap the medoid with a better cluster member: double best = 0; DBID bestid = null; int bestcluster = -1; - for(int i = 0; i < k; i++) { - DBID med = medoids.get(i); - for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { - if(med.sameDBID(iter)) { + int i = 0; + for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { + for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if (DBIDUtil.equal(miter, iter)) { continue; } // double disti = distQ.distance(id, med).doubleValue(); double cost = 0; - for(int j = 0; j < k; j++) { - for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { - double distcur = distQ.distance(iter2, medoids.get(j)).doubleValue(); + DBIDIter olditer = medoids.iter(); + for (int j = 0; j < k; j++, olditer.advance()) { + for (DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { + double distcur = distQ.distance(iter2, olditer).doubleValue(); double distnew = distQ.distance(iter2, iter).doubleValue(); - if(j == i) { + if (j == i) { // Cases 1 and 2. double distsec = second.doubleValue(iter2); - if(distcur > distsec) { + if (distcur > distsec) { // Case 1, other would switch to a third medoid cost += distsec - distcur; // Always positive! - } - else { // Would remain with the candidate + } else { // Would remain with the candidate cost += distnew - distcur; // Could be negative } - } - else { + } else { // Cases 3-4: objects from other clusters if (distcur < distnew) { // Case 3: no change @@ -185,20 +184,20 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } if (cost < best) { best = cost; - bestid = iter.getDBID(); + bestid = DBIDUtil.deref(iter); bestcluster = i; } } } - if(logger.isDebugging()) { - logger.debug("Best cost: " + best); + if (LOG.isDebugging()) { + LOG.debug("Best cost: " + best); } - if(bestid != null) { + if (bestid != null) { changed = true; medoids.set(bestcluster, bestid); } // Reassign - if(changed) { + if (changed) { // TODO: can we save some of these recomputations? assignToNearestCluster(medoids, ids, second, clusters, distQ); } @@ -206,7 +205,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist // Wrap result Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); - for(int i = 0; i < clusters.size(); i++) { + for (int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); } @@ -227,28 +226,30 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { boolean changed = false; - for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { int minIndex = 0; double mindist = Double.POSITIVE_INFINITY; double mindist2 = Double.POSITIVE_INFINITY; - for(int i = 0; i < k; i++) { - double dist = distQ.distance(iditer, means.get(i)).doubleValue(); - if(dist < mindist) { - minIndex = i; - mindist2 = mindist; - mindist = dist; - } - else if(dist < mindist2) { - mindist2 = dist; + { + int i = 0; + for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { + double dist = distQ.distance(iditer, miter).doubleValue(); + if (dist < mindist) { + minIndex = i; + mindist2 = mindist; + mindist = dist; + } else if (dist < mindist2) { + mindist2 = dist; + } } } - if(clusters.get(minIndex).add(iditer)) { + if (clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { + for (int i = 0; i < k; i++) { + if (i != minIndex) { + if (clusters.get(i).remove(iditer)) { break; } } @@ -266,7 +267,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -286,19 +287,21 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { - k = kP.getValue(); + IntParameter kP = new IntParameter(KMeans.K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { + k = kP.intValue(); } ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); - if(config.grab(initialP)) { + if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { + maxiter = maxiterP.intValue(); } } @@ -307,4 +310,4 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java index 094c37bb..1fc7160e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java @@ -113,7 +113,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean } if(mean.getMean() < best) { best = mean.getMean(); - bestid = iter.getDBID(); + bestid = DBIDUtil.deref(iter); if(bestd != null) { bestd.destroy(); } @@ -133,23 +133,21 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean DBID bestid = null; WritableDoubleDataStore bestd = null; for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); - if(medids.contains(id)) { + if(medids.contains(iter)) { continue; } WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); mean.reset(); for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - DBID other = iter2.getDBID(); - double dn = distQ.distance(id, other).doubleValue(); - double v = Math.min(dn, mindist.doubleValue(other)); + double dn = distQ.distance(iter, iter2).doubleValue(); + double v = Math.min(dn, mindist.doubleValue(iter2)); mean.put(v); - newd.put(other, v); + newd.put(iter2, v); } assert (mean.getCount() == ids.size()); if(mean.getMean() < best) { best = mean.getMean(); - bestid = id; + bestid = DBIDUtil.deref(iter); if(bestd != null) { bestd.destroy(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java index 5b9da923..78e59be7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; /** * Initialize K-means by randomly choosing k exsiting elements as cluster @@ -44,15 +45,15 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< /** * Constructor. * - * @param seed Random seed. + * @param rnd Random generator. */ - public RandomlyChosenInitialMeans(Long seed) { - super(seed); + public RandomlyChosenInitialMeans(RandomFactory rnd) { + super(rnd); } @Override public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { - DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed); + DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd); List<V> means = new ArrayList<V>(k); for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { means.add(relation.get(iter)); @@ -62,7 +63,7 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< @Override public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) { - return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, seed); + return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, rnd); } /** @@ -76,7 +77,7 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< @Override protected RandomlyChosenInitialMeans<V> makeInstance() { - return new RandomlyChosenInitialMeans<V>(seed); + return new RandomlyChosenInitialMeans<V>(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index 00ed08c4..300f5cb0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -28,9 +28,11 @@ import java.util.Random; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** @@ -41,29 +43,30 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param <V> Vector type */ -public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { +public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends AbstractKMeansInitialization<V> { /** * Constructor. * - * @param seed Random seed. + * @param rnd Random generator. */ - public RandomlyGeneratedInitialMeans(Long seed) { - super(seed); + public RandomlyGeneratedInitialMeans(RandomFactory rnd) { + super(rnd); } @Override public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { - final int dim = DatabaseUtil.dimensionality(relation); + final int dim = RelationUtil.dimensionality(relation); + NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); List<V> means = new ArrayList<V>(k); - final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); + final Random random = rnd.getRandom(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); // Rescale for(int d = 0; d < dim; d++) { - r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; + r[d] = minmax.first.doubleValue(d) + (minmax.second.doubleValue(d) - minmax.first.doubleValue(d)) * r[d]; } - means.add(minmax.first.newNumberVector(r)); + means.add(factory.newNumberVector(r)); } return means; } @@ -75,10 +78,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected RandomlyGeneratedInitialMeans<V> makeInstance() { - return new RandomlyGeneratedInitialMeans<V>(seed); + return new RandomlyGeneratedInitialMeans<V>(rnd); } } }
\ No newline at end of file |