diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
24 files changed, 1330 insertions, 293 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index 47855aad..dc1fa47c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -43,9 +43,17 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Abstract base class for k-means implementations. @@ -59,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; * @param <D> Distance type * @param <M> Cluster model type */ -public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans, ClusteringAlgorithm<Clustering<M>> { +public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans<V, D, M>, ClusteringAlgorithm<Clustering<M>> { /** * Holds the value of {@link #K_ID}. */ @@ -102,54 +110,53 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) { boolean changed = false; - if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if(dist < mindist) { + if (dist < mindist) { minIndex = i; mindist = dist; } } - if(clusters.get(minIndex).add(iditer)) { + if (clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { + for (int i = 0; i < k; i++) { + if (i != minIndex) { + if (clusters.get(i).remove(iditer)) { break; } } } } } - } - else { + } else { final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if(dist.compareTo(mindist) < 0) { + if (dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } - if(clusters.get(minIndex).add(iditer)) { + if (clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { + for (int i = 0; i < k; i++) { + if (i != minIndex) { + if (clusters.get(i).remove(iditer)) { break; } } @@ -174,21 +181,24 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @return the mean vectors of the given clusters in the given database */ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) { - List<Vector> newMeans = new ArrayList<Vector>(k); - for(int i = 0; i < k; i++) { + List<Vector> newMeans = new ArrayList<>(k); + for (int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - if(list.size() > 0) { + if (list.size() > 0) { double s = 1.0 / list.size(); DBIDIter iter = list.iter(); assert (iter.valid()); mean = database.get(iter).getColumnVector().timesEquals(s); + double[] raw = mean.getArrayRef(); iter.advance(); - for(; iter.valid(); iter.advance()) { - mean.plusTimesEquals(database.get(iter).getColumnVector(), s); + for (; iter.valid(); iter.advance()) { + NumberVector<?> vec = database.get(iter); + for (int j = 0; j < mean.getDimensionality(); j++) { + raw[j] += s * vec.doubleValue(j); + } } - } - else { + } else { mean = means.get(i).getColumnVector(); } newMeans.add(mean); @@ -207,19 +217,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan protected List<NumberVector<?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> medians, Relation<V> database) { final int dim = medians.get(0).getDimensionality(); final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); - List<NumberVector<?>> newMedians = new ArrayList<NumberVector<?>>(k); - for(int i = 0; i < k; i++) { + List<NumberVector<?>> newMedians = new ArrayList<>(k); + for (int i = 0; i < k; i++) { ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); - if(list.size() > 0) { + if (list.size() > 0) { Vector mean = new Vector(dim); - for(int d = 0; d < dim; d++) { + for (int d = 0; d < dim; d++) { sorter.setDimension(d); DBID id = QuickSelect.median(list, sorter); mean.set(d, database.get(id).doubleValue(d)); } newMedians.add(mean); - } - else { + } else { newMedians.add((NumberVector<?>) medians.get(i)); } } @@ -235,7 +244,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param op Cluster size change / Weight change */ protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) { - if(newsize == 0) { + if (newsize == 0) { return; // Keep old mean } Vector delta = vec.getColumnVector(); @@ -256,65 +265,62 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { boolean changed = false; - if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); // Incremental update - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if(dist < mindist) { + if (dist < mindist) { minIndex = i; mindist = dist; } } // Update the cluster mean incrementally: - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); - if(i == minIndex) { - if(ci.add(iditer)) { + if (i == minIndex) { + if (ci.add(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } - } - else if(ci.remove(iditer)) { + } else if (ci.remove(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } } - } - else { + } else { // Raw distance function final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); // Incremental update - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if(dist.compareTo(mindist) < 0) { + if (dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } // Update the cluster mean incrementally: - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); - if(i == minIndex) { - if(ci.add(iditer)) { + if (i == minIndex) { + if (ci.add(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } - } - else if(ci.remove(iditer)) { + } else if (ci.remove(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } @@ -323,4 +329,76 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan } return changed; } + + @Override + public void setK(int k) { + this.k = k; + } + + @Override + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + this.distanceFunction = distanceFunction; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public abstract static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { + /** + * k Parameter. + */ + protected int k; + + /** + * Maximum number of iterations. + */ + protected int maxiter; + + /** + * Initialization method. + */ + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); + if (config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); + } + } + + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class); + if (config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + /** + * Get class logger. + * + * @return Logger + */ + abstract protected Logging getLogger(); + + @Override + abstract protected AbstractKMeans<V, D, ?> makeInstance(); + } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java index 3a69c806..9e3eb478 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java new file mode 100644 index 00000000..30bb640c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java @@ -0,0 +1,219 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality.KMeansQualityMeasure; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Run K-Means multiple times, and keep the best run. + * + * @author Stephan Baier + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ +public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(BestOfMultipleKMeans.class); + + /** + * Number of trials to do. + */ + private int trials; + + /** + * Variant of kMeans for the bisecting step. + */ + private KMeans<V, D, M> innerkMeans; + + /** + * Quality measure which should be used. + */ + private KMeansQualityMeasure<? super V, ? super D> qualityMeasure; + + /** + * Constructor. + * + * @param trials Number of trials to do. + * @param innerkMeans K-Means variant to actually use. + * @param qualityMeasure Quality measure + */ + public BestOfMultipleKMeans(int trials, KMeans<V, D, M> innerkMeans, KMeansQualityMeasure<? super V, ? super D> qualityMeasure) { + super(); + this.trials = trials; + this.innerkMeans = innerkMeans; + this.qualityMeasure = qualityMeasure; + } + + @Override + public Clustering<M> run(Database database, Relation<V> relation) { + if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { + throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass()); + } + final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction(); + Clustering<M> bestResult = null; + if (trials > 1) { + double bestCost = Double.POSITIVE_INFINITY; + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null; + for (int i = 0; i < trials; i++) { + Clustering<M> currentCandidate = innerkMeans.run(database, relation); + double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation); + + if (LOG.isVerbose()) { + LOG.verbose("Cost of candidate " + i + ": " + currentCost); + } + + if (currentCost < bestCost) { + bestResult = currentCandidate; + bestCost = currentCost; + } + if (prog != null) { + prog.incrementProcessed(LOG); + } + } + if (prog != null) { + prog.ensureCompleted(LOG); + } + } else { + bestResult = innerkMeans.run(database); + } + + return bestResult; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return innerkMeans.getInputTypeRestriction(); + } + + @Override + public DistanceFunction<? super V, D> getDistanceFunction() { + return innerkMeans.getDistanceFunction(); + } + + @Override + public void setK(int k) { + innerkMeans.setK(k); + } + + @Override + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + innerkMeans.setDistanceFunction(distanceFunction); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Stephan Baier + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractParameterizer { + /** + * Parameter to specify the iterations of the bisecting step. + */ + public static final OptionID TRIALS_ID = new OptionID("kmeans.trials", "The number of trials to run."); + + /** + * Parameter to specify the kMeans variant. + */ + public static final OptionID KMEANS_ID = new OptionID("kmeans.algorithm", "KMeans variant to run multiple times."); + + /** + * Parameter to specify the variant of quality measure. + */ + public static final OptionID QUALITYMEASURE_ID = new OptionID("kmeans.qualitymeasure", "Quality measure variant for deciding which run to keep."); + + /** + * Number of trials to perform. + */ + protected int trials; + + /** + * Variant of kMeans to use. + */ + protected KMeans<V, D, M> kMeansVariant; + + /** + * Quality measure. + */ + protected KMeansQualityMeasure<? super V, ? super D> qualityMeasure; + + @Override + protected void makeOptions(Parameterization config) { + IntParameter trialsP = new IntParameter(TRIALS_ID); + trialsP.addConstraint(new GreaterEqualConstraint(1)); + if (config.grab(trialsP)) { + trials = trialsP.intValue(); + } + + ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); + if (config.grab(kMeansVariantP)) { + kMeansVariant = kMeansVariantP.instantiateClass(config); + } + + ObjectParameter<KMeansQualityMeasure<V, ? super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class); + if (config.grab(qualityMeasureP)) { + qualityMeasure = qualityMeasureP.instantiateClass(config); + } + } + + @Override + protected BestOfMultipleKMeans<V, D, M> makeInstance() { + return new BestOfMultipleKMeans<>(trials, kMeansVariant, qualityMeasure); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java new file mode 100644 index 00000000..a018c04b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java @@ -0,0 +1,186 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; + +/** + * K-Means initialization by repeatedly choosing the farthest point. + * + * Note: this is less random than other initializations, so running multiple + * times will be more likely to return the same local minima. + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { + /** + * Discard the first vector. + */ + boolean dropfirst = true; + + /** + * Constructor. + * + * @param rnd Random generator. + * @param dropfirst Flag to discard the first vector. + */ + public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) { + super(rnd); + this.dropfirst = dropfirst; + } + + @Override + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { + // Get a distance query + if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); + + // Chose first mean + List<V> means = new ArrayList<>(k); + + Random random = rnd.getRandom(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + means.add(relation.get(first)); + + DBIDVar best = DBIDUtil.newVar(first); + for (int i = (dropfirst ? 0 : 1); i < k; i++) { + // Find farthest object: + double maxdist = Double.NEGATIVE_INFINITY; + for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + double dsum = 0.; + for (V ex : means) { + dsum += distQ.distance(ex, it).doubleValue(); + } + if (dsum > maxdist) { + maxdist = dsum; + best.set(it); + } + } + // Add new mean: + if (k == 0) { + means.clear(); // Remove temporary first element. + } + means.add(relation.get(best)); + } + + return means; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { + if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; + final Relation<?> relation = distQ.getRelation(); + // Chose first mean + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + + Random random = rnd.getRandom(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + means.add(first); + + DBIDVar best = DBIDUtil.newVar(first); + for (int i = (dropfirst ? 0 : 1); i < k; i++) { + // Find farthest object: + double maxdist = Double.NEGATIVE_INFINITY; + for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + double dsum = 0.; + for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { + dsum += distQ.distance(ex, it).doubleValue(); + } + if (dsum > maxdist) { + maxdist = dsum; + best.set(it); + } + } + // Add new mean: + if (k == 0) { + means.clear(); // Remove temporary first element. + } + means.add(best); + } + + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + /** + * Option ID to control the handling of the first object chosen. + */ + public static final OptionID DROPFIRST_ID = new OptionID("farthest.dropfirst", "Drop the first object chosen (which is chosen randomly) for the farthest points heuristic."); + + /** + * Flag for discarding the first object chosen. + */ + protected boolean dropfirst = true; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + Flag dropfirstP = new Flag(DROPFIRST_ID); + if (config.grab(dropfirstP)) { + dropfirst = dropfirstP.isTrue(); + } + } + + @Override + protected FarthestPointsInitialMeans<V, D> makeInstance() { + return new FarthestPointsInitialMeans<>(rnd, dropfirst); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java index 1e51f4d6..08e2f116 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -51,9 +52,9 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { DBIDIter iter = relation.iterDBIDs(); - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { means.add(relation.get(iter)); } @@ -80,7 +81,7 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { @Override protected FirstKInitialMeans<V> makeInstance() { - return new FirstKInitialMeans<V>(); + return new FirstKInitialMeans<>(); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java index 68fc4e48..29c0a5c8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -1,12 +1,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,12 +23,27 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + /** * Some constants and options shared among kmeans family algorithms. * * @author Erich Schubert + * + * @param <V> Number vector type + * @param <D> Distance type + * @param <M> Actual model type */ -public interface KMeans { +public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V, D> { /** * Parameter to specify the initialization method */ @@ -52,4 +65,27 @@ public interface KMeans { * Parameter to specify the random generator seed. */ public static final OptionID SEED_ID = new OptionID("kmeans.seed", "The random number generator seed."); -}
\ No newline at end of file + + /** + * Run the clustering algorithm. + * + * @param database Database to run on. + * @param rel Relation to process. + * @return Clustering result + */ + Clustering<M> run(Database database, Relation<V> rel); + + /** + * Set the value of k. Needed for some types of nested k-means. + * + * @param k K parameter + */ + void setK(int k); + + /** + * Set the distance function to use. + * + * @param distanceFunction Distance function. + */ + void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java new file mode 100644 index 00000000..37071d36 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java @@ -0,0 +1,231 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.LinkedList; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ProxyDatabase; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * The bisecting k-means algorithm works by starting with an initial + * partitioning into two clusters, then repeated splitting of the largest + * cluster to get additional clusters. + * + * Reference:<br> + * <p> + * M. Steinbach, G. Karypis, V. Kumar:<br /> + * A Comparison of Document Clustering Techniques<br /> + * KDD workshop on text mining. Vol. 400. No. 1 + * </p> + * + * @author Stephan Baier + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ +@Reference(authors = "M. Steinbach, G. Karypis, V. Kumar", title = "A Comparison of Document Clustering Techniques", booktitle = "KDD workshop on text mining. Vol. 400. No. 1") +public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansBisecting.class); + + /** + * Variant of kMeans for the bisecting step. + */ + private KMeans<V, D, M> innerkMeans; + + /** + * Desired value of k. + */ + private int k; + + /** + * Constructor. + * + * @param k k parameter - number of result clusters + * @param innerkMeans KMeans variant parameter - for bisecting step + */ + public KMeansBisecting(int k, KMeans<V, D, M> innerkMeans) { + super(); + this.k = k; + this.innerkMeans = innerkMeans; + } + + @Override + public Clustering<M> run(Database database, Relation<V> relation) { + ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database); + + // Linked list is preferrable for scratch, as we will A) not need that many + // clusters and B) be doing random removals of the largest cluster (often at + // the head) + LinkedList<Cluster<M>> currentClusterList = new LinkedList<>(); + + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null; + + for (int j = 0; j < this.k - 1; j++) { + // Choose a cluster to split and project database to cluster + if (currentClusterList.size() == 0) { + proxyDB = new ProxyDatabase(relation.getDBIDs(), database); + } else { + Cluster<M> largestCluster = null; + for (Cluster<M> cluster : currentClusterList) { + if (largestCluster == null || cluster.size() > largestCluster.size()) { + largestCluster = cluster; + } + } + currentClusterList.remove(largestCluster); + proxyDB.setDBIDs(largestCluster.getIDs()); + } + + // Run the inner k-means algorithm: + // FIXME: ensure we run on the correct relation in a multirelational + // setting! + Clustering<M> innerResult = innerkMeans.run(proxyDB); + // Add resulting clusters to current result. + currentClusterList.addAll(innerResult.getAllClusters()); + + if (prog != null) { + prog.incrementProcessed(LOG); + } + if (LOG.isVerbose()) { + LOG.verbose("Iteration " + j); + } + } + if (prog != null) { + prog.ensureCompleted(LOG); + } + + // add all current clusters to the result + Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means"); + for (Cluster<M> cluster : currentClusterList) { + result.addToplevelCluster(cluster); + } + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return innerkMeans.getInputTypeRestriction(); + } + + @Override + public DistanceFunction<? super V, D> getDistanceFunction() { + return innerkMeans.getDistanceFunction(); + } + + @Override + public void setK(int k) { + this.k = k; + } + + @Override + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + innerkMeans.setDistanceFunction(distanceFunction); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Stephan Baier + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractParameterizer { + /** + * Parameter to specify the kMeans variant. + */ + public static final OptionID KMEANS_ID = new OptionID("bisecting.kmeansvariant", "KMeans variant"); + + /** + * Variant of kMeans + */ + protected KMeans<V, D, M> kMeansVariant; + + /** + * Desired number of clusters. + */ + protected int k; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + IntParameter kP = new IntParameter(KMeans.K_ID); + kP.addConstraint(new GreaterConstraint(1)); + if (config.grab(kP)) { + k = kP.intValue(); + } + + ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class); + if (config.grab(kMeansVariantP)) { + ListParameterization kMeansVariantParameters = new ListParameterization(); + + // We will always invoke this with k=2! + kMeansVariantParameters.addParameter(KMeans.K_ID, 2); + + ChainedParameterization combinedConfig = new ChainedParameterization(kMeansVariantParameters, config); + combinedConfig.errorsTo(config); + kMeansVariant = kMeansVariantP.instantiateClass(combinedConfig); + } + } + + @Override + protected KMeansBisecting<V, D, M> makeInstance() { + return new KMeansBisecting<>(k, kMeansVariant); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java index 54b3a2ce..06fb10c1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,6 +24,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.List; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; @@ -31,7 +33,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; * Interface for initializing K-Means * * @author Erich Schubert - * + * * @apiviz.landmark * * @param <V> Object type @@ -40,10 +42,12 @@ public interface KMeansInitialization<V> { /** * Choose initial means * + * @param database Database context * @param relation Relation * @param k Parameter k - * @param distanceFunction Distance function + * @param distanceFunction Distance function + * * @return List of chosen means for k-means */ - public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); + public abstract List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index f43c2277..e692293c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -36,19 +35,13 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the k-means algorithm, using Lloyd-style bulk iterations. @@ -90,28 +83,23 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten super(distanceFunction, k, maxiter, initializer); } - /** - * Run k-means. - * - * @param database Database - * @param relation relation to use - * @return result - */ + @Override public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (LOG.isVerbose()) { - LOG.verbose("K-Means iteration " + (iteration + 1)); + if (prog != null) { + prog.incrementProcessed(LOG); } boolean changed = assignToNearestCluster(relation, means, clusters); // Stop if no cluster assignment changed. @@ -121,12 +109,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten // Recompute means. means = means(clusters, means, relation); } + if (prog != null) { + prog.setCompleted(LOG); + } + // Wrap result final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { - KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); - result.addCluster(new Cluster<KMeansModel<V>>(clusters.get(i), model)); + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -143,53 +135,15 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { - /** - * k Parameter. - */ - protected int k; - - /** - * Number of iterations. - */ - protected int maxiter; - - /** - * Initialization method. - */ - protected KMeansInitialization<V> initializer; - + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { @Override - protected void makeOptions(Parameterization config) { - ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if(config.grab(distanceFunctionP)) { - distanceFunction = distanceFunctionP.instantiateClass(config); - if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { - LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); - } - } - - IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { - k = kP.getValue(); - } - - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { - initializer = initialP.instantiateClass(config); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { - maxiter = maxiterP.intValue(); - } + protected Logging getLogger() { + return LOG; } @Override protected KMeansLloyd<V, D> makeInstance() { - return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index 0cc7c363..bb689bd3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -37,20 +36,14 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the k-means algorithm, using MacQueen style incremental updates. @@ -89,24 +82,18 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex super(distanceFunction, k, maxiter, initializer); } - /** - * Run k-means. - * - * @param database Database - * @param relation relation to use - * @return Clustering result - */ + @Override public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = new ArrayList<Vector>(k); - for (NumberVector<?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) { + List<Vector> means = new ArrayList<>(k); + for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) { means.add(nv.getColumnVector()); } // Initialize cluster and assign objects - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -114,22 +101,27 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex // Initial recomputation of the means. means = means(clusters, means, relation); + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; // Refine result for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (LOG.isVerbose()) { - LOG.verbose("K-Means iteration " + (iteration + 1)); + if (prog != null) { + prog.incrementProcessed(LOG); } boolean changed = macQueenIterate(relation, means, clusters); if (!changed) { break; } } + if (prog != null) { + prog.setCompleted(LOG); + } + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); - KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); - result.addCluster(new Cluster<KMeansModel<V>>(ids, model)); + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addToplevelCluster(new Cluster<>(ids, model)); } return result; } @@ -146,53 +138,15 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { - /** - * k Parameter. - */ - protected int k; - - /** - * Maximum number of iterations. - */ - protected int maxiter; - - /** - * Initialization method. - */ - protected KMeansInitialization<V> initializer; - + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { @Override - protected void makeOptions(Parameterization config) { - ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if (config.grab(distanceFunctionP)) { - distanceFunction = distanceFunctionP.instantiateClass(config); - if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { - LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); - } - } - - IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { - k = kP.getValue(); - } - - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { - initializer = initialP.instantiateClass(config); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); - } + protected Logging getLogger() { + return LOG; } @Override protected KMeansMacQueen<V, D> makeInstance() { - return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer); + return new KMeansMacQueen<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index a07953da..302ca86b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,8 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; @@ -70,17 +72,17 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("K-Means++ initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; - DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); // Chose first mean - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); Random random = rnd.getRandom(); DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter()); @@ -99,7 +101,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten } double r = random.nextDouble() * weightsum; int pos = 0; - while(r > 0 && pos < weights.length) { + while(r > 0 && pos < weights.length - 1) { r -= weights[pos]; pos++; } @@ -125,7 +127,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten @Override public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("PAM initialization can only be used with numerical distances."); + throw new AbortException("K-Means++ initialization initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; @@ -244,7 +246,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected KMeansPlusPlusInitialMeans<V, D> makeInstance() { - return new KMeansPlusPlusInitialMeans<V, D>(rnd); + return new KMeansPlusPlusInitialMeans<>(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index 9917337e..cc7aaa9e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -39,13 +38,9 @@ import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the k-medians clustering algorithm, using Lloyd-style bulk @@ -83,28 +78,23 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext super(distanceFunction, k, maxiter, initializer); } - /** - * Run k-medians. - * - * @param database Database - * @param relation relation to use - * @return result - */ + @Override public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + return new Clustering<>("k-Medians Clustering", "kmedians-clustering"); } // Choose initial medians - List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (LOG.isVerbose()) { - LOG.verbose("K-Medians iteration " + (iteration + 1)); + if (prog != null) { + prog.incrementProcessed(LOG); } boolean changed = assignToNearestCluster(relation, medians, clusters); // Stop if no cluster assignment changed. @@ -114,12 +104,15 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext // Recompute medians. medians = medians(clusters, medians, relation); } + if (prog != null) { + prog.setCompleted(LOG); + } // Wrap result final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + Clustering<MeanModel<V>> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering"); for (int i = 0; i < clusters.size(); i++) { - MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); - result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + MeanModel<V> model = new MeanModel<>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -136,46 +129,15 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { - /** - * k Parameter. - */ - protected int k; - - /** - * Maximum number of iterations. - */ - protected int maxiter; - - /** - * Initialization method. - */ - protected KMeansInitialization<V> initializer; - + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { - k = kP.intValue(); - } - - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { - initializer = initialP.instantiateClass(config); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { - maxiter = maxiterP.intValue(); - } + protected Logging getLogger() { + return LOG; } @Override protected KMediansLloyd<V, D> makeInstance() { - return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index f4398458..87a0c7ae 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -46,6 +46,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; @@ -119,13 +120,13 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); // Choose initial medoids ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -135,9 +136,13 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista // TODO: reuse this information, from the build phase, when possible? assignToNearestCluster(medoids, mdists, clusters, distQ); + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null; // Swap phase boolean changed = true; while (changed) { + if (prog != null) { + prog.incrementProcessed(LOG); + } changed = false; // Try to swap the medoid with a better cluster member: int i = 0; @@ -168,12 +173,15 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista assignToNearestCluster(medoids, mdists, clusters, distQ); } } + if (prog != null) { + prog.setCompleted(LOG); + } // Wrap result - Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); for (int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); - result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -256,7 +264,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista k = kP.intValue(); } - ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } @@ -270,7 +278,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista @Override protected KMedoidsEM<V, D> makeInstance() { - return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer); + return new KMedoidsEM<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java index 269e7e9e..136a4129 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index 906501e4..1feda867 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -50,6 +50,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; @@ -124,14 +125,14 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); DBIDs ids = relation.getDBIDs(); // Choose initial medoids ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -141,9 +142,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist // TODO: reuse this information, from the build phase, when possible? assignToNearestCluster(medoids, ids, second, clusters, distQ); + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase boolean changed = true; while (changed) { + if (prog != null) { + prog.incrementProcessed(LOG); + } changed = false; // Try to swap the medoid with a better cluster member: double best = 0; @@ -189,6 +194,9 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } } } + if (prog != null) { + prog.setCompleted(LOG); + } if (LOG.isDebugging()) { LOG.debug("Best cost: " + best); } @@ -204,10 +212,10 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } // Wrap result - Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); for (int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); - result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -293,7 +301,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist k = kP.intValue(); } - ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } @@ -307,7 +315,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist @Override protected KMedoidsPAM<V, D> makeInstance() { - return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer); + return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java index 1fc7160e..c7e1751f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,6 +25,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; @@ -69,16 +71,16 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("PAM initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; - final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + final DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); DBIDs medids = chooseInitialMedoids(k, distQ); - List<V> medoids = new ArrayList<V>(k); + List<V> medoids = new ArrayList<>(k); for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) { medoids.add(relation.get(iter)); } @@ -179,7 +181,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer { @Override protected PAMInitialMeans<V, D> makeInstance() { - return new PAMInitialMeans<V, D>(); + return new PAMInitialMeans<>(); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java index 78e59be7..214f4ce6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,6 +25,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; @@ -52,9 +54,9 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd); - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { means.add(relation.get(iter)); } @@ -74,10 +76,9 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< * @apiviz.exclude */ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> { - @Override protected RandomlyChosenInitialMeans<V> makeInstance() { - return new RandomlyChosenInitialMeans<V>(rnd); + return new RandomlyChosenInitialMeans<>(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index 300f5cb0..ee90e0dc 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,6 +27,7 @@ import java.util.List; import java.util.Random; import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; @@ -54,11 +55,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { final int dim = RelationUtil.dimensionality(relation); NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); final Random random = rnd.getRandom(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); @@ -81,7 +82,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab public static class Parameterizer<V extends NumberVector<?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected RandomlyGeneratedInitialMeans<V> makeInstance() { - return new RandomlyGeneratedInitialMeans<V>(rnd); + return new RandomlyGeneratedInitialMeans<>(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java new file mode 100644 index 00000000..9f0a1923 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java @@ -0,0 +1,160 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ProxyDatabase; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.ProxyView; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Initialize k-means by running k-means on a sample of the data set only. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization<V> { + /** + * Variant of kMeans for the bisecting step. + */ + private KMeans<V, D, ?> innerkMeans; + + /** + * Sample size. + */ + private double rate; + + /** + * Constructor. + * + * @param rnd Random generator. + * @param innerkMeans Inner k-means algorithm. + * @param rate Sampling rate. + */ + public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, D, ?> innerkMeans, double rate) { + super(rnd); + this.innerkMeans = innerkMeans; + this.rate = rate; + } + + @Override + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { + final int samplesize = (int) Math.ceil(rate * relation.size()); + final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), samplesize, rnd); + + ProxyView<V> proxyv = new ProxyView<>(database, sample, relation); + ProxyDatabase proxydb = new ProxyDatabase(sample, proxyv); + + innerkMeans.setK(k); + @SuppressWarnings("unchecked") + PrimitiveDistanceFunction<? super NumberVector<?>, D> df = (PrimitiveDistanceFunction<? super NumberVector<?>, D>) distanceFunction; + innerkMeans.setDistanceFunction(df); + Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv); + List<V> means = new ArrayList<>(); + for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) { + means.add((V) cluster.getModel().getMean()); + } + + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <D> Distance type + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization.Parameterizer<V> { + /** + * Parameter to specify the kMeans variant. + */ + public static final OptionID KMEANS_ID = new OptionID("kmeans.algorithm", "KMeans variant to run multiple times."); + + /** + * Parameter to specify the sampling rate. + */ + public static final OptionID SAMPLE_ID = new OptionID("kmeans.samplesize", "Sample set size (if > 1) or sampling rante (if < 1)."); + + /** + * Inner k-means algorithm to use. + */ + protected KMeans<V, D, ?> innerkMeans; + + /** + * Sampling rate. + */ + protected double rate; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<KMeans<V, D, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); + if (config.grab(kMeansVariantP)) { + ListParameterization kMeansVariantParameters = new ListParameterization(); + + // We will always invoke this with k as requested from outside! + kMeansVariantParameters.addParameter(KMeans.K_ID, 13); + kMeansVariantParameters.addParameter(KMeans.DISTANCE_FUNCTION_ID, SquaredEuclideanDistanceFunction.class); + + ChainedParameterization combinedConfig = new ChainedParameterization(kMeansVariantParameters, config); + combinedConfig.errorsTo(config); + innerkMeans = kMeansVariantP.instantiateClass(combinedConfig); + } + + DoubleParameter sampleP = new DoubleParameter(SAMPLE_ID); + if (config.grab(sampleP)) { + rate = sampleP.doubleValue(); + } + } + + @Override + protected SampleKMeansInitialization<V, D> makeInstance() { + return new SampleKMeansInitialization<>(rnd, innerkMeans, rate); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java index 2ce625b0..aa4c3e24 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java new file mode 100644 index 00000000..f2de7846 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java @@ -0,0 +1,54 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; + +/** + * Interface for computing the quality of a K-Means clustering. + * + * @author Erich Schubert + * + * @param <O> Input Object restriction type + * @param <D> Distance restriction type + */ +public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Distance<?>> { + /** + * Calculates and returns the quality measure. + * + * @param clustering Clustering to analyze + * @param distanceFunction Distance function to use (usually Euclidean or + * squared Euclidean!) + * @param relation Relation for accessing objects + * @param <V> Actual vector type (could be a subtype of O!) + * + * @return quality measure + */ + <V extends O> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends D> distanceFunction, Relation<V> relation); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java new file mode 100644 index 00000000..e0ddfff0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java @@ -0,0 +1,89 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; + +/** + * Class for computing the average overall distance. + * + * The average of all average pairwise distances in a cluster. + * + * @author Stephan Baier + */ +public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> { + @Override + public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) { + @SuppressWarnings("unchecked") + final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters(); + + if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction; + double clusterDistanceSum = 0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + + // Compute sum of pairwise distances: + double clusterPairwiseDistanceSum = 0; + for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { + V obj1 = relation.get(iter1); + for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + clusterPairwiseDistanceSum += df.doubleDistance(obj1, relation.get(iter2)); + } + } + clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); + } + + return clusterDistanceSum / clusterList.size(); + } else { + double clusterDistanceSum = 0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + + // Compute sum of pairwise distances: + double clusterPairwiseDistanceSum = 0; + for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { + V obj1 = relation.get(iter1); + for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2)).doubleValue(); + } + } + clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); + } + + return clusterDistanceSum / clusterList.size(); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java new file mode 100644 index 00000000..32ad5210 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java @@ -0,0 +1,83 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; + +/** + * Class for computing the variance in a clustering result (sum-of-squares). + * + * @author Stephan Baier + */ +public class WithinClusterVarianceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> { + @Override + public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) { + @SuppressWarnings("unchecked") + final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters(); + + boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction); + if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction; + double variance = 0.0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + V mean = cluster.getModel().getMean(); + + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + double dist = df.doubleDistance(relation.get(iter), mean); + if (squared) { + variance += dist; + } else { + variance += dist * dist; + } + } + } + return variance; + } else { + double variance = 0.0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + V mean = cluster.getModel().getMean(); + + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + double dist = distanceFunction.distance(relation.get(iter), mean).doubleValue(); + variance += dist * dist; + } + } + return variance; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java new file mode 100644 index 00000000..ed9a528d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java @@ -0,0 +1,4 @@ +/** + * Quality measures for k-Means results. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
\ No newline at end of file |