diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans')
39 files changed, 3212 insertions, 1219 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index 5754e961..bdfc2f04 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,14 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.RandomlyChosenInitialMeans; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -43,11 +46,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import 
de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; @@ -60,28 +62,26 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @author Erich Schubert * - * @apiviz.has MeanModel * @apiviz.composedOf KMeansInitialization * * @param <V> Vector type - * @param <D> Distance type * @param <M> Cluster model type */ -public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans<V, D, M>, ClusteringAlgorithm<Clustering<M>> { +public abstract class AbstractKMeans<V extends NumberVector, M extends Model> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector, Clustering<M>> implements KMeans<V, M>, ClusteringAlgorithm<Clustering<M>> { /** - * Holds the value of {@link #K_ID}. + * Number of cluster centers to initialize. */ protected int k; /** - * Holds the value of {@link #MAXITER_ID}. + * Maximum number of iterations */ protected int maxiter; /** * Method to choose initial means. */ - protected KMeansInitialization<V> initializer; + protected KMeansInitialization<? super V> initializer; /** * Constructor. @@ -91,7 +91,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param maxiter Maxiter parameter * @param initializer Function to generate the initial means */ - public AbstractKMeans(PrimitiveDistanceFunction<? 
super NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public AbstractKMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { super(distanceFunction); this.k = k; this.maxiter = maxiter; @@ -106,43 +106,26 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param means a list of k means * @param clusters cluster assignment * @param assignment Current cluster assignment + * @param varsum Variance sum output * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { + protected boolean assignToNearestCluster(Relation<? extends V> relation, List<? extends NumberVector> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum) { boolean changed = false; - - if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { - @SuppressWarnings("unchecked") - final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(iditer); - int minIndex = 0; - for(int i = 0; i < k; i++) { - double dist = df.doubleDistance(fv, means.get(i)); - if(dist < mindist) { - minIndex = i; - mindist = dist; - } + Arrays.fill(varsum, 0.); + final PrimitiveDistanceFunction<? 
super NumberVector> df = getDistanceFunction(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(iditer); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.distance(fv, means.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; } - changed |= updateAssignment(iditer, clusters, assignment, minIndex); - } - } - else { - final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(iditer); - int minIndex = 0; - for(int i = 0; i < k; i++) { - D dist = df.distance(fv, means.get(i)); - if(dist.compareTo(mindist) < 0) { - minIndex = i; - mindist = dist; - } - } - changed |= updateAssignment(iditer, clusters, assignment, minIndex); } + varsum[minIndex] += mindist; + changed |= updateAssignment(iditer, clusters, assignment, minIndex); } return changed; } @@ -173,7 +156,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) { + protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector> means, Relation<V> database) { // TODO: use Kahan summation for better numerical precision? 
List<Vector> newMeans = new ArrayList<>(k); for(int i = 0; i < k; i++) { @@ -187,7 +170,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan iter.advance(); // Update with remaining instances for(; iter.valid(); iter.advance()) { - NumberVector<?> vec = database.get(iter); + NumberVector vec = database.get(iter); for(int j = 0; j < mean.getDimensionality(); j++) { raw[j] += vec.doubleValue(j); } @@ -211,24 +194,23 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List<NumberVector<?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> medians, Relation<V> database) { + protected List<Vector> medians(List<? extends ModifiableDBIDs> clusters, List<Vector> medians, Relation<V> database) { final int dim = medians.get(0).getDimensionality(); final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); - List<NumberVector<?>> newMedians = new ArrayList<>(k); + List<Vector> newMedians = new ArrayList<>(k); for(int i = 0; i < k; i++) { ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); - if(list.size() > 0) { - Vector mean = new Vector(dim); - for(int d = 0; d < dim; d++) { - sorter.setDimension(d); - DBID id = QuickSelect.median(list, sorter); - mean.set(d, database.get(id).doubleValue(d)); - } - newMedians.add(mean); + if(list.size() <= 0) { + newMedians.add(medians.get(i)); + continue; } - else { - newMedians.add((NumberVector<?>) medians.get(i)); + Vector mean = new Vector(dim); + for(int d = 0; d < dim; d++) { + sorter.setDimension(d); + DBID id = QuickSelect.median(list, sorter); + mean.set(d, database.get(id).doubleValue(d)); } + newMedians.add(mean); } return newMedians; } @@ -256,49 +238,30 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param means Means * 
@param clusters Clusters * @param assignment Current cluster assignment + * @param varsum Variance sum output * @return true when the means have changed */ - protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { + protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum) { boolean changed = false; - - if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { - // Raw distance function - @SuppressWarnings("unchecked") - final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); - - // Incremental update - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(iditer); - int minIndex = 0; - for(int i = 0; i < k; i++) { - double dist = df.doubleDistance(fv, means.get(i)); - if(dist < mindist) { - minIndex = i; - mindist = dist; - } - } - changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); - } - } - else { - // Raw distance function - final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - - // Incremental update - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(iditer); - int minIndex = 0; - for(int i = 0; i < k; i++) { - D dist = df.distance(fv, means.get(i)); - if(dist.compareTo(mindist) < 0) { - minIndex = i; - mindist = dist; - } + Arrays.fill(varsum, 0.); + + // Raw distance function + final PrimitiveDistanceFunction<? 
super NumberVector> df = getDistanceFunction(); + + // Incremental update + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(iditer); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.distance(fv, means.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; } - changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); } + varsum[minIndex] += mindist; + changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); } return changed; } @@ -339,18 +302,36 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan } @Override - public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction) { this.distanceFunction = distanceFunction; } /** + * Log statistics on the variance sum. + * + * @param varstat Statistics log instance + * @param varsum Variance sum per cluster + */ + protected void logVarstat(DoubleStatistic varstat, double[] varsum) { + if(varstat == null) { + return; + } + double s = 0.; + for(double v : varsum) { + s += v; + } + varstat.setDouble(s); + getLogger().statistics(varstat); + } + + /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ - public abstract static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { + public abstract static class Parameterizer<V extends NumberVector> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector> { /** * k Parameter. 
*/ @@ -368,25 +349,58 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan @Override protected void makeOptions(Parameterization config) { - ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if(config.grab(distanceFunctionP)) { - distanceFunction = distanceFunctionP.instantiateClass(config); - if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { - getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); - } - } + getParameterK(config); + getParameterInitialization(config); + getParameterDistanceFunction(config); + getParameterMaxIter(config); + } + /** + * Get the k parameter. + * + * @param config Parameterization + */ + protected void getParameterK(Parameterization config) { IntParameter kP = new IntParameter(K_ID); kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(kP)) { k = kP.getValue(); } + } + /** + * Get the distance function parameter. + * + * @param config Parameterization + */ + protected void getParameterDistanceFunction(Parameterization config) { + ObjectParameter<PrimitiveDistanceFunction<NumberVector>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); + if(config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); + } + } + } + + /** + * Get the initialization method parameter. 
+ * + * @param config Parameterization + */ + protected void getParameterInitialization(Parameterization config) { ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class); if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } + } + /** + * Get the max iterations parameter. + * + * @param config Parameterization + */ + protected void getParameterMaxIter(Parameterization config) { IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); if(config.grab(maxiterP)) { @@ -402,6 +416,6 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan abstract protected Logging getLogger(); @Override - abstract protected AbstractKMeans<V, D, ?> makeInstance(); + abstract protected AbstractKMeans<V, ?> makeInstance(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java index 51e7ace9..1c9c6a71 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,7 +32,6 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; 
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -50,10 +49,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Erich Schubert * * @param <V> Vector type - * @param <D> Distance type * @param <M> Model type */ -public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> { +public class BestOfMultipleKMeans<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, M> { /** * The logger for this class. */ @@ -67,12 +65,12 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< /** * Variant of kMeans for the bisecting step. */ - private KMeans<V, D, M> innerkMeans; + private KMeans<V, M> innerkMeans; /** * Quality measure which should be used. */ - private KMeansQualityMeasure<? super V, ? super D> qualityMeasure; + private KMeansQualityMeasure<? super V> qualityMeasure; /** * Constructor. @@ -81,7 +79,7 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< * @param innerkMeans K-Means variant to actually use. * @param qualityMeasure Quality measure */ - public BestOfMultipleKMeans(int trials, KMeans<V, D, M> innerkMeans, KMeansQualityMeasure<? super V, ? super D> qualityMeasure) { + public BestOfMultipleKMeans(int trials, KMeans<V, M> innerkMeans, KMeansQualityMeasure<? super V> qualityMeasure) { super(); this.trials = trials; this.innerkMeans = innerkMeans; @@ -93,34 +91,27 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< if(!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass()); } - final PrimitiveDistanceFunction<? 
super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction(); + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super NumberVector> df = (PrimitiveDistanceFunction<? super NumberVector>) innerkMeans.getDistanceFunction(); + Clustering<M> bestResult = null; - if(trials > 1) { - double bestCost = Double.POSITIVE_INFINITY; - FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null; - for(int i = 0; i < trials; i++) { - Clustering<M> currentCandidate = innerkMeans.run(database, relation); - double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation); - - if(LOG.isVerbose()) { - LOG.verbose("Cost of candidate " + i + ": " + currentCost); - } - - if(currentCost < bestCost) { - bestResult = currentCandidate; - bestCost = currentCost; - } - if(prog != null) { - prog.incrementProcessed(LOG); - } + double bestCost = Double.NaN; + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null; + for(int i = 0; i < trials; i++) { + Clustering<M> currentCandidate = innerkMeans.run(database, relation); + double currentCost = qualityMeasure.quality(currentCandidate, df, relation); + + if(LOG.isVerbose()) { + LOG.verbose("Cost of candidate " + i + ": " + currentCost); } - if(prog != null) { - prog.ensureCompleted(LOG); + + if(qualityMeasure.isBetter(currentCost, bestCost)) { + bestResult = currentCandidate; + bestCost = currentCost; } + LOG.incrementProcessed(prog); } - else { - bestResult = innerkMeans.run(database); - } + LOG.ensureCompleted(prog); return bestResult; } @@ -131,7 +122,7 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< } @Override - public DistanceFunction<? super V, D> getDistanceFunction() { + public DistanceFunction<? 
super V> getDistanceFunction() { return innerkMeans.getDistanceFunction(); } @@ -141,7 +132,7 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< } @Override - public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction) { innerkMeans.setDistanceFunction(distanceFunction); } @@ -159,10 +150,9 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< * @apiviz.exclude * * @param <V> Vector type - * @param <D> Distance type * @param <M> Model type */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer { /** * Parameter to specify the iterations of the bisecting step. */ @@ -186,12 +176,12 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< /** * Variant of kMeans to use. */ - protected KMeans<V, D, M> kMeansVariant; + protected KMeans<V, M> kMeansVariant; /** * Quality measure. */ - protected KMeansQualityMeasure<? super V, ? super D> qualityMeasure; + protected KMeansQualityMeasure<? super V> qualityMeasure; @Override protected void makeOptions(Parameterization config) { @@ -201,19 +191,19 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< trials = trialsP.intValue(); } - ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); + ObjectParameter<KMeans<V, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); if(config.grab(kMeansVariantP)) { kMeansVariant = kMeansVariantP.instantiateClass(config); } - ObjectParameter<KMeansQualityMeasure<V, ? 
super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class); + ObjectParameter<KMeansQualityMeasure<V>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class); if(config.grab(qualityMeasureP)) { qualityMeasure = qualityMeasureP.instantiateClass(config); } } @Override - protected BestOfMultipleKMeans<V, D, M> makeInstance() { + protected BestOfMultipleKMeans<V, M> makeInstance() { return new BestOfMultipleKMeans<>(trials, kMeansVariant, qualityMeasure); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java new file mode 100644 index 00000000..0a47deba --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/CLARA.java @@ -0,0 +1,244 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * Clustering Large Applications (CLARA) is a clustering method for large data + * sets based on PAM, partitioning around medoids ({@link KMedoidsPAM}) based on + * sampling. + * + * Reference: + * <p> + * L. Kaufman, P. J. 
Rousseeuw<br /> + * Clustering Large Data Sets (with discussion)<br /> + * in: Pattern Recognition in Practice II + * </p> + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +@Reference(authors = "L. Kaufman, P. J. Rousseeuw",// +title = "Clustering Large Data Sets (with discussion)", // +booktitle = "Pattern Recognition in Practice II") +public class CLARA<V> extends KMedoidsPAM<V> { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(CLARA.class); + + /** + * Sampling rate. If less than 1, it is considered to be a relative value. + */ + double sampling; + + /** + * Number of samples to draw (i.e. iterations). + */ + int numsamples; + + /** + * Random factory for initialization. + */ + RandomFactory random; + + public CLARA(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer, int numsamples, double sampling, RandomFactory random) { + super(distanceFunction, k, maxiter, initializer); + this.numsamples = numsamples; + this.sampling = sampling; + this.random = random; + } + + @Override + public Clustering<MedoidModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<>("CLARA Clustering", "clara-clustering"); + } + DBIDs ids = relation.getDBIDs(); + int sampleSize = (int) ((sampling < 1.) ? sampling * ids.size() : sampling); + DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + + double best = Double.POSITIVE_INFINITY; + ArrayModifiableDBIDs bestmedoids = null; + List<ModifiableDBIDs> bestclusters = null; + + FiniteProgress prog = LOG.isVerbose() ? 
new FiniteProgress("Random samples.", numsamples, LOG) : null; + for(int j = 0; j < numsamples; j++) { + DBIDs rids = DBIDUtil.randomSample(ids, sampleSize, random); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, rids, distQ)); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + runPAMOptimization(distQ, rids, medoids, clusters); + double score = assignRemainingToNearestCluster(medoids, ids, rids, clusters, distQ); + if(score < best) { + best = score; + bestmedoids = medoids; + bestclusters = clusters; + } + LOG.incrementProcessed(prog); + } + LOG.ensureCompleted(prog); + + // Wrap result + Clustering<MedoidModel> result = new Clustering<>("CLARA Clustering", "clara-clustering"); + for(int i = 0; i < bestclusters.size(); i++) { + MedoidModel model = new MedoidModel(bestmedoids.get(i)); + result.addToplevelCluster(new Cluster<>(bestclusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param means Object centroids + * @param ids Object ids + * @param rids Sample that was already assigned + * @param clusters cluster assignment + * @param distQ distance query + * @return Sum of distances. + */ + protected double assignRemainingToNearestCluster(ArrayDBIDs means, DBIDs ids, DBIDs rids, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) { + rids = DBIDUtil.ensureSet(rids); // Ensure we have fast contains + double distsum = 0.; + DBIDArrayIter miter = means.iter(); + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + if(rids.contains(iditer)) { + continue; + } + double mindist = Double.POSITIVE_INFINITY; + int minIndex = 0; + miter.seek(0); // Reuse iterator. 
+ for(int i = 0; miter.valid(); miter.advance(), i++) { + double dist = distQ.distance(iditer, miter); + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + distsum += mindist; + clusters.get(minIndex).add(iditer); + } + return distsum; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V> extends KMedoidsPAM.Parameterizer<V> { + /** + * The number of samples to run. + */ + public static final OptionID NUMSAMPLES_ID = new OptionID("clara.samples", "Number of samples (iterations) to run."); + + /** + * The sample size. + */ + public static final OptionID SAMPLESIZE_ID = new OptionID("clara.samplesize", "The size of the sample."); + + /** + * Random generator. + */ + public static final OptionID RANDOM_ID = new OptionID("clara.random", "Random generator seed."); + + /** + * Sampling rate. If less than 1, it is considered to be a relative value. + */ + double sampling; + + /** + * Number of samples to draw (i.e. iterations). + */ + int numsamples; + + /** + * Random factory for initialization. 
+ */ + RandomFactory random; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter numsamplesP = new IntParameter(NUMSAMPLES_ID, 5) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(numsamplesP)) { + numsamples = numsamplesP.intValue(); + } + + DoubleParameter samplingP = new DoubleParameter(SAMPLESIZE_ID) // + .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + if(config.grab(samplingP)) { + sampling = samplingP.doubleValue(); + } + + RandomParameter randomP = new RandomParameter(RANDOM_ID); + if(config.grab(randomP)) { + random = randomP.getValue(); + } + } + + @Override + protected CLARA<V> makeInstance() { + return new CLARA<>(distanceFunction, k, maxiter, initializer, numsamples, sampling, random); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java index 29c0a5c8..32d7a978 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,11 +27,10 @@ import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import 
de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; /** @@ -40,10 +39,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; * @author Erich Schubert * * @param <V> Number vector type - * @param <D> Distance type * @param <M> Actual model type */ -public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V, D> { +public interface KMeans<V extends NumberVector, M extends Model> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V> { /** * Parameter to specify the initialization method */ @@ -81,11 +79,11 @@ public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M exte * @param k K parameter */ void setK(int k); - + /** * Set the distance function to use. * * @param distanceFunction Distance function. */ - void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction); + void setDistanceFunction(PrimitiveDistanceFunction<? 
super NumberVector> distanceFunction); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java index aec4fe0f..57c2a295 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -43,13 +44,12 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -57,7 +57,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; /** - * Provides the k-means algorithm, using Lloyd-style bulk iterations. + * An algorithm for k-means, using Lloyd-style bulk iterations. * * However, in contrast to Lloyd's k-means and similar to MacQueen, we do update * the mean vectors multiple times, not only at the very end of the iteration. @@ -71,9 +71,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * @apiviz.has KMeansModel * * @param <V> vector datatype - * @param <D> distance value type */ -public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { +public class KMeansBatchedLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { /** * The logger for this class. */ @@ -99,26 +98,21 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> * @param blocks Number of blocks * @param random Random factory used for partitioning. */ - public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer, int blocks, RandomFactory random) { + public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? 
super V> initializer, int blocks, RandomFactory random) { super(distanceFunction, k, maxiter, initializer); this.blocks = blocks; this.random = random; } @Override - public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { final int dim = RelationUtil.dimensionality(relation); // Choose initial means - List<? extends NumberVector<?>> mvs = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); - // Convert to (modifiable) math vectors. - List<Vector> means = new ArrayList<>(k); - for (NumberVector<?> m : mvs) { - means.add(m.getColumnVector()); - } + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); @@ -127,45 +121,44 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> double[][] meanshift = new double[k][dim]; int[] changesize = new int[k]; + double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; - for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (prog != null) { - prog.incrementProcessed(LOG); - } + DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + LOG.incrementProcessed(prog); boolean changed = false; FiniteProgress pprog = LOG.isVerbose() ? 
new FiniteProgress("Batch", parts.length, LOG) : null; - for (int p = 0; p < parts.length; p++) { + for(int p = 0; p < parts.length; p++) { // Initialize new means scratch space. - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { Arrays.fill(meanshift[i], 0.); } Arrays.fill(changesize, 0); - changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment); + Arrays.fill(varsum, 0.); + changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment, varsum); // Recompute means. updateMeans(means, meanshift, clusters, changesize); - if (pprog != null) { - pprog.incrementProcessed(LOG); - } - } - if (pprog != null) { - pprog.ensureCompleted(LOG); + LOG.incrementProcessed(pprog); } + LOG.ensureCompleted(pprog); + logVarstat(varstat, varsum); // Stop if no cluster assignment changed. - if (!changed) { + if(!changed) { break; } } - if (prog != null) { - prog.setCompleted(LOG); - } + LOG.setCompleted(prog); // Wrap result - final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); - for (int i = 0; i < clusters.size(); i++) { - KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); - result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + if(ids.size() == 0) { + continue; + } + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); + result.addToplevelCluster(new Cluster<>(ids, model)); } return result; } @@ -181,42 +174,26 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> * @param changesize New cluster sizes * @param clusters cluster assignment * @param assignment Current 
cluster assignment + * @param varsum Sum of variances * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector<?>> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { + protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum) { boolean changed = false; - if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { - @SuppressWarnings("unchecked") - final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); - for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { - double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(iditer); - int minIndex = 0; - for (int i = 0; i < k; i++) { - double dist = df.doubleDistance(fv, oldmeans.get(i)); - if (dist < mindist) { - minIndex = i; - mindist = dist; - } - } - changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); - } - } else { - final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { - D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(iditer); - int minIndex = 0; - for (int i = 0; i < k; i++) { - D dist = df.distance(fv, oldmeans.get(i)); - if (dist.compareTo(mindist) < 0) { - minIndex = i; - mindist = dist; - } + final PrimitiveDistanceFunction<? 
super NumberVector> df = getDistanceFunction(); + for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(iditer); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.distance(fv, oldmeans.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; } - changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); } + varsum[minIndex] += mindist; + changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); } return changed; } @@ -235,7 +212,7 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> */ protected boolean updateAssignment(DBIDIter id, V fv, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[][] meanshift, int[] changesize, int minIndex) { int cur = assignment.intValue(id); - if (cur == minIndex) { + if(cur == minIndex) { return false; } // Add to new cluster. @@ -243,16 +220,16 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> clusters.get(minIndex).add(id); changesize[minIndex]++; double[] raw = meanshift[minIndex]; - for (int j = 0; j < fv.getDimensionality(); j++) { + for(int j = 0; j < fv.getDimensionality(); j++) { raw[j] += fv.doubleValue(j); } } // Remove from previous cluster - if (cur >= 0) { + if(cur >= 0) { clusters.get(cur).remove(id); changesize[cur]--; double[] raw = meanshift[cur]; - for (int j = 0; j < fv.getDimensionality(); j++) { + for(int j = 0; j < fv.getDimensionality(); j++) { raw[j] -= fv.doubleValue(j); } } @@ -269,16 +246,16 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> * @param changesize Size of change (for weighting!) 
*/ protected void updateMeans(List<Vector> means, double[][] meanshift, List<ModifiableDBIDs> clusters, int[] changesize) { - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { int newsize = clusters.get(i).size(), oldsize = newsize - changesize[i]; - if (newsize == 0) { + if(newsize == 0) { continue; // Keep previous mean vector. } - if (oldsize == 0) { + if(oldsize == 0) { means.set(i, new Vector(meanshift[i]).times(1. / newsize)); continue; // Replace with new vector. } - if (oldsize == newsize) { + if(oldsize == newsize) { means.get(i).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize); continue; } @@ -298,7 +275,7 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { /** * Parameter for the number of blocks. 
*/ @@ -324,11 +301,11 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> super.makeOptions(config); IntParameter blocksP = new IntParameter(BLOCKS_ID, 10); blocksP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); - if (config.grab(blocksP)) { + if(config.grab(blocksP)) { blocks = blocksP.intValue(); } RandomParameter randomP = new RandomParameter(RANDOM_ID); - if (config.grab(randomP)) { + if(config.grab(randomP)) { random = randomP.getValue(); } } @@ -339,7 +316,7 @@ public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D> } @Override - protected KMeansBatchedLloyd<V, D> makeInstance() { + protected KMeansBatchedLloyd<V> makeInstance() { return new KMeansBatchedLloyd<>(distanceFunction, k, maxiter, initializer, blocks, random); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java index 80a581b1..c6361550 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -35,7 +35,6 @@ import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -63,11 
+62,10 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Stephan Baier * * @param <V> Vector type - * @param <D> Distance type * @param <M> Model type */ @Reference(authors = "M. Steinbach, G. Karypis, V. Kumar", title = "A Comparison of Document Clustering Techniques", booktitle = "KDD workshop on text mining. Vol. 400. No. 1") -public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> { +public class KMeansBisecting<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, M> { /** * The logger for this class. */ @@ -76,7 +74,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M /** * Variant of kMeans for the bisecting step. */ - private KMeans<V, D, M> innerkMeans; + private KMeans<V, M> innerkMeans; /** * Desired value of k. @@ -89,7 +87,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M * @param k k parameter - number of result clusters * @param innerkMeans KMeans variant parameter - for bisecting step */ - public KMeansBisecting(int k, KMeans<V, D, M> innerkMeans) { + public KMeansBisecting(int k, KMeans<V, M> innerkMeans) { super(); this.k = k; this.innerkMeans = innerkMeans; @@ -128,16 +126,12 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M // Add resulting clusters to current result. 
currentClusterList.addAll(innerResult.getAllClusters()); - if (prog != null) { - prog.incrementProcessed(LOG); - } + LOG.incrementProcessed(prog); if (LOG.isVerbose()) { LOG.verbose("Iteration " + j); } } - if (prog != null) { - prog.ensureCompleted(LOG); - } + LOG.ensureCompleted(prog); // add all current clusters to the result Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means"); @@ -153,7 +147,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M } @Override - public DistanceFunction<? super V, D> getDistanceFunction() { + public DistanceFunction<? super V> getDistanceFunction() { return innerkMeans.getDistanceFunction(); } @@ -163,7 +157,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M } @Override - public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector> distanceFunction) { innerkMeans.setDistanceFunction(distanceFunction); } @@ -181,10 +175,9 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M * @apiviz.exclude * * @param <V> Vector type - * @param <D> Distance type * @param <M> Model type */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer { /** * Parameter to specify the kMeans variant. */ @@ -193,7 +186,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M /** * Variant of kMeans */ - protected KMeans<V, D, M> kMeansVariant; + protected KMeans<V, M> kMeansVariant; /** * Desired number of clusters. 
@@ -210,7 +203,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M k = kP.intValue(); } - ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class); + ObjectParameter<KMeans<V, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class); if (config.grab(kMeansVariantP)) { ListParameterization kMeansVariantParameters = new ListParameterization(); @@ -224,7 +217,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M } @Override - protected KMeansBisecting<V, D, M> makeInstance() { + protected KMeansBisecting<V, M> makeInstance() { return new KMeansBisecting<>(k, kMeansVariant); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java index 2a60ef27..a978745a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -35,28 +36,26 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import 
de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** - * Provides the k-means algorithm, alternating between MacQueen-style - * incremental processing and Lloyd-Style batch steps. + * A hybrid k-means algorithm, alternating between MacQueen-style incremental + * processing and Lloyd-Style batch steps. * * @author Erich Schubert * - * @apiviz.landmark * @apiviz.has KMeansModel * * @param <V> vector datatype - * @param <D> distance value type */ -public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { +public class KMeansHybridLloydMacQueen<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { /** * The logger for this class. */ @@ -70,61 +69,59 @@ public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Dist * @param maxiter Maxiter parameter * @param initializer Initialization method */ - public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? 
super V> initializer) { super(distanceFunction, k, maxiter, initializer); } @Override - public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = new ArrayList<>(k); - for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) { - means.add(nv.getColumnVector()); - } + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; - for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) { + DoubleStatistic varstat = LOG.isStatistics() ? 
new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) { { // MacQueen - if (prog != null) { - prog.incrementProcessed(LOG); - } - boolean changed = macQueenIterate(relation, means, clusters, assignment); - if (!changed) { + LOG.incrementProcessed(prog); + boolean changed = macQueenIterate(relation, means, clusters, assignment, varsum); + logVarstat(varstat, varsum); + if(!changed) { break; } } { // Lloyd - if (prog != null) { - prog.incrementProcessed(LOG); - } - boolean changed = assignToNearestCluster(relation, means, clusters, assignment); + LOG.incrementProcessed(prog); + boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); + logVarstat(varstat, varsum); // Stop if no cluster assignment changed. - if (!changed) { + if(!changed) { break; } // Recompute means. means = means(clusters, means, relation); } } - if (prog != null) { - prog.setCompleted(LOG); - } + LOG.setCompleted(prog); // Wrap result - final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); - for (int i = 0; i < clusters.size(); i++) { - KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); - result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + if(ids.size() == 0) { + continue; + } + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); + result.addToplevelCluster(new Cluster<>(ids, model)); } return result; } @@ -141,14 +138,14 @@ public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Dist * * @apiviz.exclude */ - public static class Parameterizer<V extends 
NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { @Override protected Logging getLogger() { return LOG; } @Override - protected KMeansHybridLloydMacQueen<V, D> makeInstance() { + protected KMeansHybridLloydMacQueen<V> makeInstance() { return new KMeansHybridLloydMacQueen<>(distanceFunction, k, maxiter, initializer); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index 686e2076..ed92190d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -35,19 +36,20 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import 
de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; /** - * Provides the k-means algorithm, using Lloyd-style bulk iterations. + * The standard k-means algorithm, using Lloyd-style bulk iterations. * * <p> * Reference:<br /> @@ -63,12 +65,14 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * @apiviz.has KMeansModel * * @param <V> vector datatype - * @param <D> distance value type */ @Title("K-Means") -@Description("Finds a partitioning into k clusters.") -@Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489") -public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { +@Description("Finds a least-squared partitioning into k clusters.") +@Reference(authors = "S. Lloyd", // +title = "Least squares quantization in PCM", // +booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", // +url = "http://dx.doi.org/10.1109/TIT.1982.1056489") +public class KMeansLloyd<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { /** * The logger for this class. 
*/ @@ -82,47 +86,49 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten * @param maxiter Maxiter parameter * @param initializer Initialization method */ - public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public KMeansLloyd(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { super(distanceFunction, k, maxiter, initializer); } @Override - public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; - for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (prog != null) { - prog.incrementProcessed(LOG); - } - boolean changed = assignToNearestCluster(relation, means, clusters, assignment); + DoubleStatistic varstat = LOG.isStatistics() ? 
new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + LOG.incrementProcessed(prog); + boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum); + logVarstat(varstat, varsum); // Stop if no cluster assignment changed. - if (!changed) { + if(!changed) { break; } // Recompute means. means = means(clusters, means, relation); } - if (prog != null) { - prog.setCompleted(LOG); - } + LOG.setCompleted(prog); // Wrap result - final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); - for (int i = 0; i < clusters.size(); i++) { - KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); - result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + if(ids.size() == 0) { + continue; + } + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); + result.addToplevelCluster(new Cluster<>(ids, model)); } return result; } @@ -139,14 +145,14 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { @Override protected Logging getLogger() { return LOG; } @Override - protected KMeansLloyd<V, D> makeInstance() { + protected KMeansLloyd<V> makeInstance() { return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index a0f4bb3f..7d2f805a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -38,18 +39,22 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; /** - * Provides the k-means algorithm, using MacQueen style incremental updates. 
+ * The original k-means algorithm, using MacQueen style incremental updates; + * making this effectively an "online" (streaming) algorithm. + * + * This implementation will by default iterate over the data set until + * convergence, although MacQueen likely only meant to do a single pass over the + * data. * * <p> * Reference:<br /> @@ -62,12 +67,14 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * @apiviz.has KMeansModel * * @param <V> vector type to use - * @param <D> distance function value type */ @Title("K-Means") -@Description("Finds a partitioning into k clusters.") -@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992") -public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { +@Description("Finds a least-squares partitioning into k clusters.") +@Reference(authors = "J. MacQueen", // +title = "Some Methods for Classification and Analysis of Multivariate Observations", // +booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", // +url = "http://projecteuclid.org/euclid.bsmsp/1200512992") +public class KMeansMacQueen<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { /** * The logger for this class. */ @@ -81,47 +88,44 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex * @param maxiter Maxiter parameter * @param initializer Initialization method */ - public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? 
super V> initializer) { super(distanceFunction, k, maxiter, initializer); } @Override - public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = new ArrayList<>(k); - for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) { - means.add(nv.getColumnVector()); - } - // Initialize cluster and assign objects + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + double[] varsum = new double[k]; IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; - // Refine result - for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (prog != null) { - prog.incrementProcessed(LOG); - } - boolean changed = macQueenIterate(relation, means, clusters, assignment); - if (!changed) { + DoubleStatistic varstat = LOG.isStatistics() ? 
new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null; + // Iterate MacQueen + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + LOG.incrementProcessed(prog); + boolean changed = macQueenIterate(relation, means, clusters, assignment, varsum); + logVarstat(varstat, varsum); + if(!changed) { break; } } - if (prog != null) { - prog.setCompleted(LOG); - } + LOG.setCompleted(prog); - final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); - for (int i = 0; i < clusters.size(); i++) { + Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); - KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getArrayRef())); + if(ids.size() == 0) { + continue; + } + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); result.addToplevelCluster(new Cluster<>(ids, model)); } return result; @@ -139,14 +143,14 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { @Override protected Logging getLogger() { return LOG; } @Override - protected KMeansMacQueen<V, D> makeInstance() { + protected KMeansMacQueen<V> makeInstance() { return new KMeansMacQueen<>(distanceFunction, k, maxiter, initializer); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index 0a97c4d3..49b67599 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -1,147 +1,148 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2013 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - -import java.util.ArrayList; -import java.util.List; - -import de.lmu.ifi.dbs.elki.data.Cluster; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; -import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.documentation.Title; - -/** - * Provides the k-medians clustering algorithm, using Lloyd-style bulk - * iterations. - * - * Reference: - * <p> - * Clustering via Concave Minimization<br /> - * P. S. Bradley, O. L. Mangasarian, W. N. Street<br /> - * in: Advances in neural information processing systems - * </p> - * - * @author Erich Schubert - * - * @param <V> vector datatype - * @param <D> distance value type - */ -@Title("K-Medians") -@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url = "http://nips.djvuzone.org/djvu/nips09/0368.djvu") -public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, MeanModel<V>> { - /** - * The logger for this class. 
- */ - private static final Logging LOG = Logging.getLogger(KMediansLloyd.class); - - /** - * Constructor. - * - * @param distanceFunction distance function - * @param k k parameter - * @param maxiter Maxiter parameter - * @param initializer Initialization method - */ - public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { - super(distanceFunction, k, maxiter, initializer); - } - - @Override - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { - return new Clustering<>("k-Medians Clustering", "kmedians-clustering"); - } - // Choose initial medians - List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); - // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); - } - WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); - - IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null; - for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (prog != null) { - prog.incrementProcessed(LOG); - } - boolean changed = assignToNearestCluster(relation, medians, clusters, assignment); - // Stop if no cluster assignment changed. - if (!changed) { - break; - } - // Recompute medians. 
- medians = medians(clusters, medians, relation); - } - if (prog != null) { - prog.setCompleted(LOG); - } - // Wrap result - final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<MeanModel<V>> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering"); - for (int i = 0; i < clusters.size(); i++) { - MeanModel<V> model = new MeanModel<>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); - result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); - } - return result; - } - - @Override - protected Logging getLogger() { - return LOG; - } - - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { - @Override - protected Logging getLogger() { - return LOG; - } - - @Override - protected KMediansLloyd<V, D> makeInstance() { - return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer); - } - } -} +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+
+/**
+ * k-medians clustering algorithm, but using Lloyd-style bulk iterations instead
+ * of the more complicated approach suggested by Kaufman and Rousseeuw (see
+ * {@link KMedoidsPAM} instead).
+ *
+ * Reference:
+ * <p>
+ * Clustering via Concave Minimization<br />
+ * P. S. Bradley, O. L. Mangasarian, W. N. Street<br />
+ * in: Advances in Neural Information Processing Systems (NIPS)
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector datatype
+ */
+@Title("K-Medians")
+@Reference(title = "Clustering via Concave Minimization", //
+authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", //
+booktitle = "Advances in Neural Information Processing Systems", //
+url = "https://papers.nips.cc/paper/1260-clustering-via-concave-minimization.pdf")
+public class KMediansLloyd<V extends NumberVector> extends AbstractKMeans<V, MeanModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMediansLloyd.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ */
+ public KMediansLloyd(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ @Override
+ public Clustering<MeanModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("k-Medians Clustering", "kmedians-clustering");
+ }
+ // Choose initial medians
+ List<Vector> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+ double[] distsum = new double[k];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ boolean changed = assignToNearestCluster(relation, medians, clusters, assignment, distsum);
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute medians.
+ medians = medians(clusters, medians, relation);
+ }
+ LOG.setCompleted(prog);
+ // Wrap result
+ Clustering<MeanModel> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MeanModel model = new MeanModel(medians.get(i));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected KMediansLloyd<V> makeInstance() {
+ return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index 41cca225..d2f98466 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,8 +27,9 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.FarthestPointsInitialMeans; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.model.MedoidModel; @@ -43,8 +44,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.Mean; @@ -54,8 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * Provides the k-medoids clustering algorithm, using a "bulk" variation of the - * "Partitioning Around Medoids" approach. + * A k-medoids clustering algorithm, implemented as EM-style bulk algorithm. * * In contrast to PAM, which will in each iteration update one medoid with one * (arbitrary) non-medoid, this implementation follows the EM pattern. In the @@ -72,9 +71,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @apiviz.composedOf KMedoidsInitialization * * @param <V> vector datatype - * @param <D> distance value type */ -public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { +public class KMedoidsEM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { /** * The logger for this class. */ @@ -103,7 +101,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista * @param maxiter Maxiter parameter * @param initializer Function to generate the initial means */ - public KMedoidsEM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + public KMedoidsEM(DistanceFunction<? 
super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { super(distanceFunction); this.k = k; this.maxiter = maxiter; @@ -121,9 +119,9 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } - DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction()); // Choose initial medoids - ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for(int i = 0; i < k; i++) { @@ -139,9 +137,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista // Swap phase boolean changed = true; while(changed) { - if(prog != null) { - prog.incrementProcessed(LOG); - } + LOG.incrementProcessed(prog); changed = false; // Try to swap the medoid with a better cluster member: int i = 0; @@ -154,7 +150,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista } Mean mdist = new Mean(); for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { - mdist.put(distQ.distance(iter, iter2).doubleValue()); + mdist.put(distQ.distance(iter, iter2)); } if(mdist.getMean() < bestm.getMean()) { best = DBIDUtil.deref(iter); @@ -172,9 +168,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista assignToNearestCluster(medoids, mdists, clusters, distQ); } } - if(prog != null) { - prog.setCompleted(LOG); - } + LOG.setCompleted(prog); // Wrap result Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); @@ -195,7 +189,7 @@ public class KMedoidsEM<V, D extends 
NumberDistance<D, ?>> extends AbstractDista * @param distQ distance query * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) { boolean changed = false; double[] dists = new double[k]; @@ -205,7 +199,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista { int i = 0; for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { - dists[i] = distQ.distance(iditer, miter).doubleValue(); + dists[i] = distQ.distance(iditer, miter); if(dists[i] < mindist) { minIndex = i; mindist = dists[i]; @@ -247,7 +241,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista * * @apiviz.exclude */ - public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> { protected int k; protected int maxiter; @@ -263,7 +257,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista k = kP.intValue(); } - ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, FarthestPointsInitialMeans.class); if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } @@ -276,7 +270,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista } @Override - protected KMedoidsEM<V, D> makeInstance() { + protected KMedoidsEM<V> makeInstance() { return new KMedoidsEM<>(distanceFunction, k, maxiter, initializer); } 
} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index c9e1dc47..30bb56e5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,8 +27,9 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.model.MedoidModel; @@ -40,15 +41,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import 
de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -59,14 +60,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * Provides the k-medoids clustering algorithm, using the - * "Partitioning Around Medoids" approach. + * The original PAM algorithm or k-medoids clustering, as proposed by Kaufman + * and Rousseeuw in "Partitioning Around Medoids". * * Reference: * <p> * Clustering my means of Medoids<br /> * Kaufman, L. and Rousseeuw, P.J.<br /> - * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + * in: Statistical Data Analysis Based on the L1-Norm and Related Methods * </p> * * @author Erich Schubert @@ -75,23 +76,24 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @apiviz.composedOf KMedoidsInitialization * * @param <V> vector datatype - * @param <D> distance value type */ @Title("Partioning Around Medoids") -@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") -public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { +@Reference(title = "Clustering by means of Medoids", // +authors = "Kaufman, L. 
and Rousseeuw, P.J.", // +booktitle = "Statistical Data Analysis Based on the L1-Norm and Related Methods") +public class KMedoidsPAM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class); /** - * Holds the value of {@link AbstractKMeans#K_ID}. + * The number of clusters to produce. */ protected int k; /** - * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + * The maximum number of iterations. */ protected int maxiter; @@ -108,7 +110,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @param maxiter Maxiter parameter * @param initializer Function to generate the initial means */ - public KMedoidsPAM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + public KMedoidsPAM(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { super(distanceFunction); this.k = k; this.maxiter = maxiter; @@ -126,16 +128,36 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } - DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction()); DBIDs ids = relation.getDBIDs(); // Choose initial medoids - ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } + runPAMOptimization(distQ, ids, medoids, 
clusters); + + // Wrap result + Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + /** + * Run the PAM optimization phase. + * + * @param distQ Distance query + * @param ids IDs to process + * @param medoids Medoids list + * @param clusters Clusters + */ + protected void runPAMOptimization(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, List<ModifiableDBIDs> clusters) { WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); // Initial assignment to nearest medoids // TODO: reuse this information, from the build phase, when possible? @@ -143,15 +165,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase + DBIDVar bestid = DBIDUtil.newVar(); boolean changed = true; while(changed) { - if(prog != null) { - prog.incrementProcessed(LOG); - } + LOG.incrementProcessed(prog); changed = false; // Try to swap the medoid with a better cluster member: double best = 0; - DBID bestid = null; int bestcluster = -1; int i = 0; for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { @@ -159,30 +179,28 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist if(DBIDUtil.equal(miter, iter)) { continue; } - // double disti = distQ.distance(id, med).doubleValue(); double cost = 0; - DBIDIter olditer = medoids.iter(); - for(int j = 0; j < k; j++, olditer.advance()) { + DBIDIter miter2 = medoids.iter(); + for(int j = 0; j < k; j++, miter2.advance()) { for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { - double distcur = distQ.distance(iter2, olditer).doubleValue(); - double distnew = distQ.distance(iter2, iter).doubleValue(); + if(DBIDUtil.equal(miter2, iter2)) { + continue; + } + double distcur = distQ.distance(iter2, miter2); + double distnew = distQ.distance(iter2, iter); if(j == i) { // Cases 1 and 2. double distsec = second.doubleValue(iter2); - if(distcur > distsec) { - // Case 1, other would switch to a third medoid - cost += distsec - distcur; // Always positive! - } - else { // Would remain with the candidate - cost += distnew - distcur; // Could be negative - } + cost += (distcur > distsec) ? // + // Case 1, other would switch to a third medoid + distsec - distcur // Always positive! 
+ : // Would remain with the candidate + distnew - distcur; // Could be negative } else { // Cases 3-4: objects from other clusters - if(distcur < distnew) { - // Case 3: no change - } - else { + // Case 3: is no change + if(distcur > distnew) { // Case 4: would switch to new medoid cost += distnew - distcur; // Always negative } @@ -191,18 +209,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } if(cost < best) { best = cost; - bestid = DBIDUtil.deref(iter); + bestid.set(iter); bestcluster = i; } } } - if(prog != null) { - prog.setCompleted(LOG); - } if(LOG.isDebugging()) { LOG.debug("Best cost: " + best); } - if(bestid != null) { + if(best < 0.) { changed = true; medoids.set(bestcluster, bestid); } @@ -212,14 +227,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist assignToNearestCluster(medoids, ids, second, clusters, distQ); } } - - // Wrap result - Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); - for(int i = 0; i < clusters.size(); i++) { - MedoidModel model = new MedoidModel(medoids.get(i)); - result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); - } - return result; + LOG.setCompleted(prog); } /** @@ -233,36 +241,32 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @param distQ distance query * @return true when any object was reassigned */ - protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? 
extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) { boolean changed = false; + DBIDArrayIter miter = means.iter(); for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY, mindist2 = Double.POSITIVE_INFINITY; int minIndex = 0; - double mindist = Double.POSITIVE_INFINITY; - double mindist2 = Double.POSITIVE_INFINITY; - { - int i = 0; - for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { - double dist = distQ.distance(iditer, miter).doubleValue(); - if(dist < mindist) { - minIndex = i; - mindist2 = mindist; - mindist = dist; - } - else if(dist < mindist2) { - mindist2 = dist; - } + miter.seek(0); // Reuse iterator. + for(int i = 0; miter.valid(); miter.advance(), i++) { + double dist = distQ.distance(iditer, miter); + if(dist < mindist) { + minIndex = i; + mindist2 = mindist; + mindist = dist; + } + else if(dist < mindist2) { + mindist2 = dist; } } if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { - break; - } + for(int j = 0; j < k; j++) { + if(j != minIndex && clusters.get(j).remove(iditer)) { + break; } } } @@ -288,18 +292,27 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * * @apiviz.exclude */ - public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> { + /** + * The number of clusters to produce. + */ protected int k; + /** + * The maximum number of iterations. + */ protected int maxiter; + /** + * Method to choose initial means. 
+ */ protected KMedoidsInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + IntParameter kP = new IntParameter(KMeans.K_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(kP)) { k = kP.intValue(); } @@ -309,15 +322,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); - maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } @Override - protected KMedoidsPAM<V, D> makeInstance() { + protected KMedoidsPAM<V> makeInstance() { return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java new file mode 100644 index 00000000..d63d52d4 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SingleAssignmentKMeans.java @@ -0,0 +1,130 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; + +/** + * Pseudo-k-Means variation that assigns each object to the nearest center. + * + * @author Erich Schubert + * + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + */ +public class SingleAssignmentKMeans<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(SingleAssignmentKMeans.class); + + /** + * Constructor. 
+ * + * @param distanceFunction distance function + * @param k k parameter + * @param initializer Initialization method + */ + public SingleAssignmentKMeans(PrimitiveDistanceFunction<NumberVector> distanceFunction, int k, KMeansInitialization<? super V> initializer) { + super(distanceFunction, k, -1, initializer); + } + + @Override + public Clustering<KMeansModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<>("k-Means Assignment", "kmeans-assignment"); + } + // Choose initial means + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + double[] varsum = new double[k]; + + assignToNearestCluster(relation, means, clusters, assignment, varsum); + + // Wrap result + Clustering<KMeansModel> result = new Clustering<>("Nearest Centroid Clustering", "nearest-center-clustering"); + for(int i = 0; i < clusters.size(); i++) { + KMeansModel model = new KMeansModel(means.get(i), varsum[i]); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> { + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected void makeOptions(Parameterization config) { + // Do NOT invoke super.makeOptions, as we don't want to have the maxiter + // parameter, nor a warning for other distance functions. 
+ getParameterDistanceFunction(config); + getParameterK(config); + getParameterInitialization(config); + } + + @Override + protected SingleAssignmentKMeans<V> makeInstance() { + return new SingleAssignmentKMeans<>(distanceFunction, k, initializer); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java new file mode 100644 index 00000000..4e31a6ba --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java @@ -0,0 +1,393 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PredefinedInitialMeans; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality.KMeansQualityMeasure; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ProxyDatabase; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualGlobalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * X-means: Extending K-means with Efficient Estimation on the Number of + * Clusters. + * + * Note: this implementation does currently <em>not</em> use a k-d-tree for + * acceleration. Also note that k_max is not a hard threshold - the algorithm + * can return up to 2*k_max clusters! + * + * Reference:<br> + * <p> + * D. Pelleg, A. Moore:<br /> + * X-means: Extending K-means with Efficient Estimation on the Number of + * Clusters<br /> + * In: Proceedings of the 17th International Conference on Machine Learning + * (ICML 2000) + * </p> + * + * @author Tibor Goldschwendt + * @author Erich Schubert + * + * @param <V> Vector type + * @param <M> Model type + */ +@Reference(authors = "D. Pelleg, A. Moore", // +title = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", // +booktitle = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", // +url = "http://www.pelleg.org/shared/hp/download/xmeans.ps") +public class XMeans<V extends NumberVector, M extends MeanModel> extends AbstractKMeans<V, M> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(XMeans.class); + + /** + * Inner k-means algorithm. + */ + private KMeans<V, M> innerKMeans; + + /** + * Effective number of clusters, minimum and maximum. + */ + private int k, k_min, k_max; + + /** + * Initializer for k-means. + */ + PredefinedInitialMeans splitInitializer; + + /** + * Information criterion to choose the better split. + */ + KMeansQualityMeasure<V> informationCriterion; + + /** + * Random factory. + */ + RandomFactory rnd; + + /** + * Constructor. + * + * @param distanceFunction Distance function + * @param k_min k_min parameter - minimum number of result clusters + * @param k_max k_max parameter - maximum number of result clusters + * @param maxiter Maximum number of iterations each. 
+ * @param innerKMeans K-Means variant to use inside. + * @param informationCriterion The information criterion used for the + * splitting step + * @param random Random factory + */ + public XMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k_min, int k_max, int maxiter, KMeans<V, M> innerKMeans, KMeansInitialization<? super V> initializer, PredefinedInitialMeans splitInitializer, KMeansQualityMeasure<V> informationCriterion, RandomFactory random) { + super(distanceFunction, k_min, maxiter, initializer); + this.k_min = k_min; + this.k_max = k_max; + this.k = k_min; + this.innerKMeans = innerKMeans; + this.splitInitializer = splitInitializer; + this.informationCriterion = informationCriterion; + this.rnd = random; + } + + /** + * Run the algorithm on a database and relation. + * + * @param database Database to process + * @param relation Data relation + * @return Clustering result. + */ + @Override + public Clustering<M> run(Database database, Relation<V> relation) { + MutableProgress prog = LOG.isVerbose() ? 
new MutableProgress("X-means number of clusters", k_max, LOG) : null; + + // Run initial k-means to find at least k_min clusters + innerKMeans.setK(k_min); + splitInitializer.setInitialMeans(initializer.chooseInitialMeans(database, relation, k_min, getDistanceFunction(), Vector.FACTORY)); + Clustering<M> clustering = innerKMeans.run(database, relation); + + if(prog != null) { + prog.setProcessed(k_min, LOG); + } + + ArrayList<Cluster<M>> clusters = new ArrayList<>(clustering.getAllClusters()); + while(clusters.size() <= k_max) { + // Improve-Structure: + ArrayList<Cluster<M>> nextClusters = new ArrayList<>(); + for(Cluster<M> cluster : clusters) { + // Try to split this cluster: + List<Cluster<M>> childClusterList = splitCluster(cluster, database, relation); + nextClusters.addAll(childClusterList); + if(childClusterList.size() > 1) { + k += childClusterList.size() - 1; + if(prog != null) { + if(k >= k_max) { + prog.setTotal(k + 1); + } + prog.setProcessed(k, LOG); + } + } + } + if(clusters.size() == nextClusters.size()) { + break; + } + // Improve-Params: + splitInitializer.setInitialClusters(nextClusters); + innerKMeans.setK(nextClusters.size()); + clustering = innerKMeans.run(database, relation); + clusters.clear(); + clusters.addAll(clustering.getAllClusters()); + } + + // Ensure that the progress bar finished. + if(prog != null) { + prog.setTotal(k); + prog.setProcessed(k, LOG); + } + + if(LOG.isDebugging()) { + LOG.debug("X-means returned k=" + k + " clusters."); + } + + // add all current clusters to the result + Clustering<M> result = new Clustering<>("X-Means Result", "X-Means", clusters); + return result; + } + + /** + * Conditionally splits the clusters based on the information criterion. + * + * @param parentCluster Cluster to split + * @param database Database + * @param relation Data relation + * @return Parent cluster when split decreases clustering quality or child + * clusters when split improves clustering. 
+ */ + protected List<Cluster<M>> splitCluster(Cluster<M> parentCluster, Database database, Relation<V> relation) { + // Transform parent cluster into a clustering + ArrayList<Cluster<M>> parentClusterList = new ArrayList<Cluster<M>>(1); + parentClusterList.add(parentCluster); + Clustering<M> parentClustering = new Clustering<>(parentCluster.getName(), parentCluster.getName(), parentClusterList); + + if(parentCluster.size() < 2) { + // Split is not possbile + return parentClusterList; + } + + ProxyDatabase proxyDB = new ProxyDatabase(parentCluster.getIDs(), database); + + splitInitializer.setInitialMeans(splitCentroid(parentCluster, relation)); + innerKMeans.setK(2); + Clustering<M> childClustering = innerKMeans.run(proxyDB); + + double parentEvaluation = informationCriterion.quality(parentClustering, getDistanceFunction(), relation); + double childrenEvaluation = informationCriterion.quality(childClustering, getDistanceFunction(), relation); + + if(LOG.isDebugging()) { + LOG.debug("parentEvaluation: " + parentEvaluation); + LOG.debug("childrenEvaluation: " + childrenEvaluation); + } + + // Check if split is an improvement: + return (childrenEvaluation > parentEvaluation) ^ informationCriterion.ascending() ? parentClusterList : childClustering.getAllClusters(); + } + + /** + * Split an existing centroid into two initial centers. + * + * @param parentCluster Existing cluster + * @param relation Data relation + * @return List of new centroids + */ + protected List<? extends NumberVector> splitCentroid(Cluster<? extends MeanModel> parentCluster, Relation<V> relation) { + Vector parentCentroid = parentCluster.getModel().getMean(); + + // Compute size of cluster/region + double radius = 0.; + for(DBIDIter it = parentCluster.getIDs().iter(); it.valid(); it.advance()) { + double d = getDistanceFunction().distance(relation.get(it), parentCentroid); + radius = (d > radius) ? 
d : radius; + } + + // Choose random vector + Random random = rnd.getSingleThreadedRandom(); + final int dim = RelationUtil.dimensionality(relation); + Vector randomVector = VectorUtil.randomVector(Vector.FACTORY, dim, random).normalize(); + randomVector.timesEquals((.4 + random.nextDouble() * .5) * radius); + + // Get the new centroids + ArrayList<Vector> vecs = new ArrayList<>(2); + vecs.add(parentCentroid.minus(randomVector)); + vecs.add(randomVector.plusEquals(parentCentroid)); + return vecs; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return innerKMeans.getInputTypeRestriction(); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Tibor Goldschwendt + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <M> Model type of inner algorithm + */ + public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractKMeans.Parameterizer<V> { + /** + * Parameter to specify the kMeans variant. + */ + public static final OptionID INNER_KMEANS_ID = new OptionID("xmeans.kmeans", "kMeans algorithm to use."); + + /** + * Minimum number of clusters. + */ + public static final OptionID K_MIN_ID = new OptionID("xmeans.k_min", "The minimum number of clusters to find."); + + /** + * Randomization seed. + */ + public static final OptionID SEED_ID = new OptionID("xmeans.seed", "Random seed for splitting clusters."); + + /** + * Quality measure to use for evaluating splits. + */ + public static final OptionID INFORMATION_CRITERION_ID = new OptionID("xmeans.quality", "The quality measure to evaluate splits (e.g. AIC, BIC)"); + + /** + * Variant of kMeans + */ + protected KMeans<V, M> innerKMeans; + + /** + * Class to feed splits to the internal k-means algorithm. + */ + protected PredefinedInitialMeans splitInitializer; + + /** + * Information criterion. 
+ */ + protected KMeansQualityMeasure<V> informationCriterion; + + /** + * Minimum and maximum number of result clusters. + */ + protected int k_min, k_max; + + /** + * Random number generator. + */ + private RandomFactory random; + + @Override + protected void makeOptions(Parameterization config) { + // Do NOT invoke super.makeOptions to hide the "k" parameter. + IntParameter kMinP = new IntParameter(K_MIN_ID, 2) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kMinP)) { + k_min = kMinP.intValue(); + } + IntParameter kMaxP = new IntParameter(KMeans.K_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kMaxP)) { + k_max = kMaxP.intValue(); + } + // We allow k_min = k_max. + config.checkConstraint(new LessEqualGlobalConstraint<>(kMinP, kMaxP)); + getParameterInitialization(config); + getParameterMaxIter(config); + getParameterDistanceFunction(config); + + RandomParameter rndP = new RandomParameter(SEED_ID); + if(config.grab(rndP)) { + random = rndP.getValue(); + } + splitInitializer = new PredefinedInitialMeans((List<Vector>) null); + + ObjectParameter<KMeans<V, M>> innerKMeansP = new ObjectParameter<>(INNER_KMEANS_ID, KMeans.class, KMeansLloyd.class); + if(config.grab(innerKMeansP)) { + ListParameterization initialKMeansVariantParameters = new ListParameterization(); + initialKMeansVariantParameters.addParameter(KMeans.K_ID, k_min); + initialKMeansVariantParameters.addParameter(KMeans.INIT_ID, splitInitializer); + initialKMeansVariantParameters.addParameter(KMeans.MAXITER_ID, maxiter); + initialKMeansVariantParameters.addParameter(KMeans.DISTANCE_FUNCTION_ID, distanceFunction); + ChainedParameterization combinedConfig = new ChainedParameterization(initialKMeansVariantParameters, config); + combinedConfig.errorsTo(config); + innerKMeans = innerKMeansP.instantiateClass(combinedConfig); + } + + ObjectParameter<KMeansQualityMeasure<V>> informationCriterionP = new ObjectParameter<>(INFORMATION_CRITERION_ID, 
KMeansQualityMeasure.class); + if(config.grab(informationCriterionP)) { + informationCriterion = informationCriterionP.instantiateClass(config); + } + } + + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected XMeans<V, M> makeInstance() { + return new XMeans<V, M>(distanceFunction, k_min, k_max, maxiter, innerKMeans, initializer, splitInitializer, informationCriterion, random); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java index 9e3eb478..43c59c65 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/AbstractKMeansInitialization.java @@ -1,10 +1,10 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,7 +22,9 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -34,7 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * * @param <V> Vector type */ -public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> { +public abstract class AbstractKMeansInitialization<V extends NumberVector> implements KMeansInitialization<V> { /** * Random number generator */ @@ -56,7 +58,7 @@ public abstract class AbstractKMeansInitialization<V> implements KMeansInitializ * * @apiviz.exclude */ - public abstract static class Parameterizer<V> extends AbstractParameterizer { + public abstract static class Parameterizer extends AbstractParameterizer { /** * Random generator */ diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java index 9edfd816..599c3975 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestPointsInitialMeans.java @@ -1,183 +1,199 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2013 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero 
General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -import java.util.ArrayList; -import java.util.List; - -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; -import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; - -/** - * K-Means initialization by repeatedly choosing the farthest point. - * - * Note: this is less random than other initializations, so running multiple - * times will be more likely to return the same local minima. 
- * - * @author Erich Schubert - * - * @param <V> Vector type - * @param <D> Distance type - */ -public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { - /** - * Discard the first vector. - */ - boolean dropfirst = true; - - /** - * Constructor. - * - * @param rnd Random generator. - * @param dropfirst Flag to discard the first vector. - */ - public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) { - super(rnd); - this.dropfirst = dropfirst; - } - - @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { - // Get a distance query - if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); - } - @SuppressWarnings("unchecked") - final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; - DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); - - // Chose first mean - List<V> means = new ArrayList<>(k); - - DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter(); - means.add(relation.get(first)); - - DBIDVar best = DBIDUtil.newVar(first); - for(int i = (dropfirst ? 0 : 1); i < k; i++) { - // Find farthest object: - double maxdist = Double.NEGATIVE_INFINITY; - for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { - double dsum = 0.; - for(V ex : means) { - dsum += distQ.distance(ex, it).doubleValue(); - } - if(dsum > maxdist) { - maxdist = dsum; - best.set(it); - } - } - // Add new mean: - if(k == 0) { - means.clear(); // Remove temporary first element. - } - means.add(relation.get(best)); - } - - return means; - } - - @Override - public DBIDs chooseInitialMedoids(int k, DistanceQuery<? 
super V, ?> distQ2) { - if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); - } - @SuppressWarnings("unchecked") - DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; - final Relation<?> relation = distQ.getRelation(); - // Chose first mean - ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - - DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter(); - means.add(first); - - DBIDVar best = DBIDUtil.newVar(first); - for(int i = (dropfirst ? 0 : 1); i < k; i++) { - // Find farthest object: - double maxdist = Double.NEGATIVE_INFINITY; - for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { - double dsum = 0.; - for(DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { - dsum += distQ.distance(ex, it).doubleValue(); - } - if(dsum > maxdist) { - maxdist = dsum; - best.set(it); - } - } - // Add new mean: - if(k == 0) { - means.clear(); // Remove temporary first element. - } - means.add(best); - } - - return means; - } - - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { - /** - * Option ID to control the handling of the first object chosen. - */ - public static final OptionID DROPFIRST_ID = new OptionID("farthest.dropfirst", "Drop the first object chosen (which is chosen randomly) for the farthest points heuristic."); - - /** - * Flag for discarding the first object chosen. 
- */ - protected boolean dropfirst = true; - - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - Flag dropfirstP = new Flag(DROPFIRST_ID); - if(config.grab(dropfirstP)) { - dropfirst = dropfirstP.isTrue(); - } - } - - @Override - protected FarthestPointsInitialMeans<V, D> makeInstance() { - return new FarthestPointsInitialMeans<>(rnd, dropfirst); - } - } -} +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+
+/**
+ * K-Means initialization by repeatedly choosing the farthest point (by the
+ * <em>minimum</em> distance to earlier points).
+ *
+ * Note: this is less random than other initializations, so running multiple
+ * times will be more likely to return the same local minima.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for kMedoids and kMedians
+ */
+public class FarthestPointsInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
+  /**
+   * Discard the first vector (the randomly chosen seed point): when true, the
+   * random seed is only used to start the farthest-point search, and is
+   * replaced by the first farthest point found.
+   */
+  boolean dropfirst = true;
+
+  /**
+   * Constructor.
+   *
+   * @param rnd Random generator.
+   * @param dropfirst Flag to discard the first vector.
+   */
+  public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) {
+    super(rnd);
+    this.dropfirst = dropfirst;
+  }
+
+  /**
+   * Choose k initial means: start from a random seed object, then repeatedly
+   * pick the object with the largest <em>minimum</em> distance to all means
+   * chosen so far. Uses a temporary double store to keep each object's current
+   * minimum distance; NaN marks objects already chosen.
+   */
+  @Override
+  public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+    // Get a distance query
+    DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction);
+
+    DBIDs ids = relation.getDBIDs();
+    // Minimum distance of each object to the chosen means; +inf = not yet seen.
+    WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.POSITIVE_INFINITY);
+
+    // Chose first mean
+    List<V> means = new ArrayList<>(k);
+
+    DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+    T prevmean = relation.get(first);
+    means.add(factory.newNumberVector(prevmean));
+
+    // Find farthest object each.
+    DBIDVar best = DBIDUtil.newVar(first);
+    // When dropfirst, start at i=0 so one extra iteration replaces the seed.
+    for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+      double maxdist = Double.NEGATIVE_INFINITY;
+      for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+        final double prev = store.doubleValue(it);
+        if(prev != prev) {
+          continue; // NaN: already chosen!
+        }
+        // Only the newest mean can lower the stored minimum distance.
+        double val = Math.min(prev, distQ.distance(prevmean, it));
+        // Don't store distance to first mean, when it will be dropped below.
+        if(i > 0) {
+          store.putDouble(it, val);
+        }
+        if(val > maxdist) {
+          maxdist = val;
+          best.set(it);
+        }
+      }
+      // Add new mean (and drop the initial mean when desired)
+      if(i == 0) {
+        means.clear(); // Remove temporary first element.
+      }
+      store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+      prevmean = relation.get(best);
+      means.add(factory.newNumberVector(prevmean));
+    }
+
+    // Explicitly destroy temporary data.
+    store.destroy();
+    return means;
+  }
+
+  /**
+   * Choose k initial medoids: a random seed object, then repeatedly the
+   * candidate with the largest minimum distance to the medoids chosen so far.
+   * NaN in the temporary store marks objects already chosen.
+   */
+  @Override
+  public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
+    @SuppressWarnings("unchecked")
+    final Relation<O> relation = (Relation<O>) distQ.getRelation();
+
+    // Minimum distance of each candidate to the chosen medoids.
+    WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.POSITIVE_INFINITY);
+
+    ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+    // Choose a random object as temporary seed.
+    DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+    means.add(first);
+    O prevmean = relation.get(first);
+
+    DBIDVar best = DBIDUtil.newVar(first);
+    for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+      // Find farthest object:
+      double maxdist = Double.NEGATIVE_INFINITY;
+      // Iterate the candidate set "ids" (not the full relation), since the
+      // temporary store is only allocated for these objects.
+      for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+        final double prev = store.doubleValue(it);
+        if(prev != prev) {
+          continue; // NaN: already chosen!
+        }
+        double val = Math.min(prev, distQ.distance(prevmean, it));
+        // Don't store distance to first mean, when it will be dropped below.
+        if(i > 0) {
+          store.putDouble(it, val);
+        }
+        if(val > maxdist) {
+          maxdist = val;
+          best.set(it);
+        }
+      }
+      // Add new mean:
+      if(i == 0) {
+        means.clear(); // Remove temporary first element.
+      }
+      store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+      prevmean = relation.get(best);
+      means.add(best);
+    }
+
+    // Explicitly destroy temporary data (previously leaked here, although the
+    // means variant of this method destroys its store).
+    store.destroy();
+    return means;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<O> extends AbstractKMeansInitialization.Parameterizer {
+    /**
+     * Option ID to control the handling of the first object chosen.
+     */
+    public static final OptionID KEEPFIRST_ID = new OptionID("farthest.keepfirst", "Keep the first object chosen (which is chosen randomly) for the farthest points heuristic.");
+
+    /**
+     * Flag for <em>keeping</em> the first object chosen (default: discard it).
+     */
+    protected boolean keepfirst = false;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      // Renamed from "dropfirstP": the flag grabbed here is KEEPFIRST_ID.
+      Flag keepfirstP = new Flag(KEEPFIRST_ID);
+      if(config.grab(keepfirstP)) {
+        keepfirst = keepfirstP.isTrue();
+      }
+    }
+
+    @Override
+    protected FarthestPointsInitialMeans<O> makeInstance() {
+      // Note the negation: the constructor takes "dropfirst".
+      return new FarthestPointsInitialMeans<>(rnd, !keepfirst);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java new file mode 100644 index 00000000..c6b02e67 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FarthestSumPointsInitialMeans.java @@ -0,0 +1,176 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+
+/**
+ * K-Means initialization by repeatedly choosing the farthest point (by the
+ * <em>sum</em> of distances to previous objects).
+ *
+ * Note: this is less random than other initializations, so running multiple
+ * times will be more likely to return the same local minima.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for kmedoids and kmedians
+ */
+public class FarthestSumPointsInitialMeans<O> extends FarthestPointsInitialMeans<O> {
+  /**
+   * Constructor.
+   *
+   * @param rnd Random generator.
+   * @param dropfirst Flag to discard the first vector (the random seed point).
+   */
+  public FarthestSumPointsInitialMeans(RandomFactory rnd, boolean dropfirst) {
+    super(rnd, dropfirst);
+  }
+
+  /**
+   * Choose k initial means: start from a random seed object, then repeatedly
+   * pick the object with the largest <em>sum</em> of distances to all means
+   * chosen so far (this is the difference to the superclass, which uses the
+   * minimum distance). NaN in the temporary store marks chosen objects.
+   */
+  @Override
+  public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+    // Get a distance query
+    DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction);
+
+    DBIDs ids = relation.getDBIDs();
+    // Running sum of distances of each object to the chosen means.
+    WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
+    // Chose first mean
+    List<V> means = new ArrayList<>(k);
+
+    DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+    T prevmean = relation.get(first);
+    means.add(factory.newNumberVector(prevmean));
+
+    // Find farthest object each.
+    DBIDVar best = DBIDUtil.newVar(first);
+    // When dropfirst, start at i=0 so one extra iteration replaces the seed.
+    for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+      double maxdist = Double.NEGATIVE_INFINITY;
+      for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+        final double prev = store.doubleValue(it);
+        if(prev != prev) {
+          continue; // NaN: already chosen!
+        }
+        // Accumulate: only the newest mean's distance needs to be added.
+        double dsum = prev + distQ.distance(prevmean, it);
+        // Don't store distance to first mean, when it will be dropped below.
+        if(i > 0) {
+          store.putDouble(it, dsum);
+        }
+        if(dsum > maxdist) {
+          maxdist = dsum;
+          best.set(it);
+        }
+      }
+      // Add new mean (and drop the initial mean when desired)
+      if(i == 0) {
+        means.clear(); // Remove temporary first element.
+      }
+      store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+      prevmean = relation.get(best);
+      means.add(factory.newNumberVector(prevmean));
+    }
+
+    // Explicitly destroy temporary data.
+    store.destroy();
+    return means;
+  }
+
+  /**
+   * Choose k initial medoids: a random seed object, then repeatedly the
+   * candidate with the largest <em>sum</em> of distances to the medoids
+   * chosen so far. NaN in the temporary store marks objects already chosen.
+   */
+  @Override
+  public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
+    @SuppressWarnings("unchecked")
+    final Relation<O> relation = (Relation<O>) distQ.getRelation();
+
+    // Running sum of distances of each candidate to the chosen medoids.
+    WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
+    ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+    // Choose a random object as temporary seed.
+    DBIDRef first = DBIDUtil.randomSample(ids, 1, rnd).iter();
+    means.add(first);
+    O prevmean = relation.get(first);
+
+    DBIDVar best = DBIDUtil.newVar(first);
+    for(int i = (dropfirst ? 0 : 1); i < k; i++) {
+      // Find farthest object:
+      double maxdist = Double.NEGATIVE_INFINITY;
+      // Iterate the candidate set "ids" (not the full relation), since the
+      // temporary store is only allocated for these objects.
+      for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+        final double prev = store.doubleValue(it);
+        if(prev != prev) {
+          continue; // NaN: already chosen!
+        }
+        double dsum = prev + distQ.distance(prevmean, it);
+        // Don't store distance to first mean, when it will be dropped below.
+        if(i > 0) {
+          store.putDouble(it, dsum);
+        }
+        if(dsum > maxdist) {
+          maxdist = dsum;
+          best.set(it);
+        }
+      }
+      // Add new medoid, dropping the temporary seed on the first iteration.
+      // Bugfix: this previously tested "k == 0", which can never hold (k >= 1),
+      // so with dropfirst the random seed was never removed and k+1 medoids
+      // were returned. All sibling methods correctly test "i == 0".
+      if(i == 0) {
+        means.clear(); // Remove temporary first element.
+      }
+      store.putDouble(best, Double.NaN); // So it won't be chosen twice.
+      prevmean = relation.get(best);
+      means.add(best);
+    }
+
+    // Explicitly destroy temporary data.
+    store.destroy();
+    return means;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<V> extends FarthestPointsInitialMeans.Parameterizer<V> {
+    // Bugfix: do NOT redeclare "keepfirst" here. The previous shadowing field
+    // ("protected boolean keepfirst = false;") hid the superclass field that
+    // makeOptions() assigns, so the user-supplied -farthest.keepfirst flag was
+    // silently ignored and the default (false) was always used.
+    @Override
+    protected FarthestSumPointsInitialMeans<V> makeInstance() {
+      // Inherited "keepfirst" is set by the superclass makeOptions().
+      return new FarthestSumPointsInitialMeans<>(rnd, !keepfirst);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java index 08e2f116..af5a7717 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/FirstKInitialMeans.java @@ -1,87 +1,87 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2013 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ -import java.util.ArrayList; -import java.util.List; - -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; - -/** - * Initialize K-means by using the first k objects as initial means. - * - * @author Erich Schubert - * - * @param <V> Vector type - */ -public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsInitialization<V> { - /** - * Constructor. - */ - public FirstKInitialMeans() { - super(); - } - - @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { - DBIDIter iter = relation.iterDBIDs(); - List<V> means = new ArrayList<>(k); - for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { - means.add(relation.get(iter)); - } - return means; - } - - @Override - public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) { - DBIDIter iter = distanceFunction.getRelation().iterDBIDs(); - ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { - means.add(iter); - } - return means; - } - - /** - * Parameterization class. 
- * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { - @Override - protected FirstKInitialMeans<V> makeInstance() { - return new FirstKInitialMeans<>(); - } - } +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Initialize K-means by using the first k objects as initial means.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for KMedoids
+ */
+public class FirstKInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> {
+ /**
+ * Constructor.
+ */
+ public FirstKInitialMeans() {
+ super();
+ }
+
+ @Override
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ DBIDIter iter = relation.iterDBIDs();
+ List<V> means = new ArrayList<>(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(factory.newNumberVector(relation.get(iter)));
+ }
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distanceFunction) {
+ DBIDIter iter = ids.iter();
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(iter);
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ @Override
+ protected FirstKInitialMeans<V> makeInstance() {
+ return new FirstKInitialMeans<>();
+ }
+ }
}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java index 06fb10c1..e87b9d14 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansInitialization.java @@ -1,10 +1,10 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -36,9 +36,9 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; * * @apiviz.landmark * - * @param <V> Object type + * @param <V> Vector type */ -public interface KMeansInitialization<V> { +public interface KMeansInitialization<V extends NumberVector> { /** * Choose initial means * @@ -46,8 +46,10 @@ public interface KMeansInitialization<V> { * @param relation Relation * @param k Parameter k * @param distanceFunction Distance function - * + * @param factory Factory for output vectors. + * @param <T> Input vector type + * @param <O> Output vector type * @return List of chosen means for k-means */ - public abstract List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction); + public abstract <T extends V, O extends NumberVector> List<O> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? 
super T> distanceFunction, NumberVector.Factory<O> factory); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java index 6fc514eb..977a4182 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMeansPlusPlusInitialMeans.java @@ -1,10 +1,10 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -28,21 +28,21 @@ import java.util.Random; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; -import 
de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; /** * K-Means++ initialization for k-means. @@ -57,11 +57,10 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; * * @author Erich Schubert * - * @param <V> Vector type - * @param <D> Distance type + * @param <O> Vector type */ @Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494") -public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { +public class KMeansPlusPlusInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> { /** * Constructor. * @@ -72,25 +71,20 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten } @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { - // Get a distance query - if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("K-Means++ initialization can only be used with numerical distances."); - } - @SuppressWarnings("unchecked") - final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; - DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); + public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? 
super T> distanceFunction, NumberVector.Factory<V> factory) { + DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction); + + DBIDs ids = relation.getDBIDs(); + WritableDoubleDataStore weights = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.); // Chose first mean List<V> means = new ArrayList<>(k); Random random = rnd.getSingleThreadedRandom(); - DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, random).iter()); - means.add(relation.get(first)); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(ids, 1, random).iter()); + means.add(factory.newNumberVector(relation.get(first))); - ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); // Initialize weights - double[] weights = new double[ids.size()]; double weightsum = initialWeights(weights, ids, first, distQ); while(means.size() < k) { if(weightsum > Double.MAX_VALUE) { @@ -100,47 +94,43 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); } double r = random.nextDouble() * weightsum; - int pos = 0; - while(r > 0 && pos < weights.length - 1) { - r -= weights[pos]; - pos++; + DBIDIter it = ids.iter(); + for(; r > 0. && it.valid(); it.advance()) { + double w = weights.doubleValue(it); + if(w != w) { + continue; // NaN: alrady chosen. + } + r -= w; } // Add new mean: - DBID newmean = ids.get(pos); - means.add(relation.get(newmean)); + final T newmean = relation.get(it); + means.add(factory.newNumberVector(newmean)); // Update weights: - weights[pos] = 0.0; + weights.putDouble(it, Double.NaN); // Choose optimized version for double distances, if applicable. 
- if(distF instanceof PrimitiveDoubleDistanceFunction) { - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF; - weightsum = updateWeights(weights, ids, newmean, ddist, relation); - } - else { - weightsum = updateWeights(weights, ids, newmean, distQ); - } + weightsum = updateWeights(weights, ids, newmean, distQ); } + // Explicitly destroy temporary data. + weights.destroy(); + return means; } @Override - public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { - if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("K-Means++ initialization initialization can only be used with numerical distances."); - } + public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) { @SuppressWarnings("unchecked") - DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; - // Chose first mean + final Relation<O> rel = (Relation<O>) distQ.getRelation(); + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + WritableDoubleDataStore weights = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.); + Random random = rnd.getSingleThreadedRandom(); - DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random).iter()); + DBIDRef first = DBIDUtil.randomSample(ids, 1, random).iter(); means.add(first); - ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); // Initialize weights - double[] weights = new double[ids.size()]; double weightsum = initialWeights(weights, ids, first, distQ); while(means.size() < k) { if(weightsum > Double.MAX_VALUE) { @@ -150,17 +140,19 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); } double r = random.nextDouble() * weightsum; - int pos = 0; - while(r > 0 && pos < weights.length) 
{ - r -= weights[pos]; - pos++; + DBIDIter it = ids.iter(); + for(; r > 0. && it.valid(); it.advance()) { + double w = weights.doubleValue(it); + if(w != w) { + continue; // NaN: alrady chosen. + } + r -= w; } // Add new mean: - DBID newmean = ids.get(pos); - means.add(newmean); + means.add(it); // Update weights: - weights[pos] = 0.0; - weightsum = updateWeights(weights, ids, newmean, distQ); + weights.putDouble(it, Double.NaN); + weightsum = updateWeights(weights, ids, rel.get(it), distQ); } return means; @@ -175,18 +167,13 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten * @param distQ Distance query * @return Weight sum */ - protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) { - double weightsum = 0.0; - DBIDIter it = ids.iter(); - for(int i = 0; i < weights.length; i++, it.advance()) { - if(DBIDUtil.equal(latest, it)) { - weights[i] = 0.0; - } - else { - double d = distQ.distance(latest, it).doubleValue(); - weights[i] = d * d; - } - weightsum += weights[i]; + protected double initialWeights(WritableDoubleDataStore weights, DBIDs ids, DBIDRef latest, DistanceQuery<?> distQ) { + double weightsum = 0.; + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { + // Distance will usually already be squared + double weight = distQ.distance(latest, it); + weights.putDouble(it, weight); + weightsum += weight; } return weightsum; } @@ -200,38 +187,19 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten * @param distQ Distance query * @return Weight sum */ - protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? 
super V, D> distQ) { - double weightsum = 0.0; - DBIDIter it = ids.iter(); - for(int i = 0; i < weights.length; i++, it.advance()) { - if(weights[i] > 0.0) { - double d = distQ.distance(latest, it).doubleValue(); - weights[i] = Math.min(weights[i], d * d); - weightsum += weights[i]; + protected <T> double updateWeights(WritableDoubleDataStore weights, DBIDs ids, T latest, DistanceQuery<? super T> distQ) { + double weightsum = 0.; + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { + double weight = weights.doubleValue(it); + if(weight != weight) { + continue; // NaN: already chosen! } - } - return weightsum; - } - - /** - * Update the weight list. - * - * @param weights Weight list - * @param ids IDs - * @param latest Added ID - * @param distF Distance function - * @return Weight sum - */ - protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, PrimitiveDoubleDistanceFunction<V> distF, Relation<V> rel) { - final V lv = rel.get(latest); - double weightsum = 0.0; - DBIDIter it = ids.iter(); - for(int i = 0; i < weights.length; i++, it.advance()) { - if(weights[i] > 0.0) { - double d = distF.doubleDistance(lv, rel.get(it)); - weights[i] = Math.min(weights[i], d * d); - weightsum += weights[i]; + double newweight = distQ.distance(latest, it); + if(newweight < weight) { + weights.putDouble(it, newweight); + weight = newweight; } + weightsum += weight; } return weightsum; } @@ -243,9 +211,9 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten * * @apiviz.exclude */ - public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer { @Override - protected KMeansPlusPlusInitialMeans<V, D> makeInstance() { + protected KMeansPlusPlusInitialMeans<V> makeInstance() { return new KMeansPlusPlusInitialMeans<>(rnd); } } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java index 136a4129..9ae59961 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/KMedoidsInitialization.java @@ -1,9 +1,9 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -38,8 +38,9 @@ public interface KMedoidsInitialization<V> { * Choose initial means * * @param k Parameter k + * @param ids Candidate IDs. * @param distanceFunction Distance function * @return List of chosen means for k-means */ - public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction); + public abstract DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super V> distanceFunction); }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java index c7e1751f..812ea4ab 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PAMInitialMeans.java @@ -1,10 +1,10 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -38,7 +38,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -58,11 +57,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; * * @author Erich Schubert * - * @param <V> Vector type - * @param <D> Distance type + * @param <O> Object type for KMedoids initialization */ -@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") -public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMeansInitialization<V>, KMedoidsInitialization<V> { +@Reference(title = "Clustering my means of Medoids", // +authors = "Kaufman, L. 
and Rousseeuw, P.J.", // +booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class PAMInitialMeans<O> implements KMeansInitialization<NumberVector>, KMedoidsInitialization<O> { /** * Constructor. */ @@ -71,31 +71,24 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean } @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { + public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) { + // Ugly cast; but better than code duplication. + @SuppressWarnings("unchecked") + Relation<O> rel = (Relation<O>) relation; // Get a distance query - if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("PAM initialization can only be used with numerical distances."); - } @SuppressWarnings("unchecked") - final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; - final DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); - DBIDs medids = chooseInitialMedoids(k, distQ); + final PrimitiveDistanceFunction<? super O> distF = (PrimitiveDistanceFunction<? super O>) distanceFunction; + final DistanceQuery<O> distQ = database.getDistanceQuery(rel, distF); + DBIDs medids = chooseInitialMedoids(k, rel.getDBIDs(), distQ); List<V> medoids = new ArrayList<>(k); for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) { - medoids.add(relation.get(iter)); + medoids.add(factory.newNumberVector(relation.get(iter))); } return medoids; } @Override - public DBIDs chooseInitialMedoids(int k, DistanceQuery<? 
super V, ?> distQ2) { - if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("PAM initialization can only be used with numerical distances."); - } - @SuppressWarnings("unchecked") - DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; - final DBIDs ids = distQ.getRelation().getDBIDs(); - + public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) { ArrayModifiableDBIDs medids = DBIDUtil.newArray(k); double best = Double.POSITIVE_INFINITY; Mean mean = new Mean(); // Mean is numerically more stable than sum. @@ -109,7 +102,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); mean.reset(); for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - double d = distQ.distance(iter, iter2).doubleValue(); + double d = distQ.distance(iter, iter2); mean.put(d); newd.putDouble(iter2, d); } @@ -141,7 +134,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); mean.reset(); for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - double dn = distQ.distance(iter, iter2).doubleValue(); + double dn = distQ.distance(iter, iter2); double v = Math.min(dn, mindist.doubleValue(iter2)); mean.put(v); newd.put(iter2, v); @@ -178,9 +171,9 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean * * @apiviz.exclude */ - public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + public static class Parameterizer<V> extends AbstractParameterizer { @Override - protected PAMInitialMeans<V, D> makeInstance() { + protected PAMInitialMeans<V> makeInstance() { return new PAMInitialMeans<>(); } } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java new file mode 100644 index 00000000..01df56ae --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/PredefinedInitialMeans.java @@ -0,0 +1,161 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.NumberVector.Factory; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.VectorListParameter; + +/** + * Run k-means with prespecified initial means. + * + * @author Erich Schubert + */ +public class PredefinedInitialMeans extends AbstractKMeansInitialization<NumberVector> { + /** + * Initial means to return. + */ + List<? extends NumberVector> initialMeans; + + /** + * Constructor. + * + * @param initialMeans Initial means + */ + public PredefinedInitialMeans(List<? extends NumberVector> initialMeans) { + super(null); + this.setInitialMeans(initialMeans); + } + + /** + * Constructor. + * + * @param initialMeans Initial means + */ + public PredefinedInitialMeans(double[][] initialMeans) { + super(null); + this.setInitialMeans(initialMeans); + } + + /** + * Set the initial means. + * + * Important notice: Use with care - the means are <em>not copied</em>! + * + * @param initialMeans initial means. + */ + public void setInitialMeans(List<? extends NumberVector> initialMeans) { + this.initialMeans = initialMeans; + } + + /** + * Set the initial means. + * + * Important notice: Use with care - the means are <em>not copied</em>! + * + * @param initialMeans initial means. 
+ */ + public void setInitialClusters(List<? extends Cluster<? extends MeanModel>> initialMeans) { + List<Vector> vecs = new ArrayList<>(initialMeans.size()); + for(Cluster<? extends MeanModel> cluster : initialMeans) { + vecs.add(cluster.getModel().getMean().copy()); + } + this.initialMeans = vecs; + } + + /** + * Set the initial means. + * + * Important notice: Use with care - the means are <em>not copied</em>! + * + * @param initialMeans initial means. + */ + public void setInitialMeans(double[][] initialMeans) { + List<Vector> vecs = new ArrayList<>(initialMeans.length); + for(int i = 0; i < initialMeans.length; ++i) { + vecs.add(new Vector(initialMeans[i])); + } + this.initialMeans = vecs; + } + + @Override + public <T extends NumberVector, O extends NumberVector> List<O> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, Factory<O> factory) { + if(k != initialMeans.size()) { + throw new AbortException("Predefined initial means contained " + initialMeans.size() + " means, algorithm requested " + k + " means instead."); + } + // Chose first mean + List<O> means = new ArrayList<>(k); + + for(NumberVector v : initialMeans) { + means.add(factory.newNumberVector(v)); + } + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Option to specify the initial means to use. + */ + public static final OptionID INITIAL_MEANS = new OptionID("kmeans.means", "Initial means for k-means."); + + /** + * Initial means. 
+ */ + protected List<Vector> initialMeans; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + VectorListParameter meansP = new VectorListParameter(INITIAL_MEANS); + if(config.grab(meansP)) { + initialMeans = meansP.getValue(); + } + } + + @Override + protected PredefinedInitialMeans makeInstance() { + return new PredefinedInitialMeans(initialMeans); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java index 214f4ce6..fe6b7281 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyChosenInitialMeans.java @@ -1,84 +1,84 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2013 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ -import java.util.ArrayList; -import java.util.List; - -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; - -/** - * Initialize K-means by randomly choosing k exsiting elements as cluster - * centers. - * - * @author Erich Schubert - * - * @param <V> Vector type - */ -public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { - /** - * Constructor. - * - * @param rnd Random generator. - */ - public RandomlyChosenInitialMeans(RandomFactory rnd) { - super(rnd); - } - - @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { - DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd); - List<V> means = new ArrayList<>(k); - for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - means.add(relation.get(iter)); - } - return means; - } - - @Override - public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) { - return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, rnd); - } - - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> { - @Override - protected RandomlyChosenInitialMeans<V> makeInstance() { - return new RandomlyChosenInitialMeans<>(rnd); - } - } +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+
+/**
+ * Initialize K-means by randomly choosing k existing elements as cluster
+ * centers.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type for the k-medoids initialization
+ */
+public class RandomlyChosenInitialMeans<O> extends AbstractKMeansInitialization<NumberVector> implements KMedoidsInitialization<O> {
+ /**
+ * Constructor.
+ *
+ * @param rnd Random generator.
+ */
+ public RandomlyChosenInitialMeans(RandomFactory rnd) {
+ super(rnd);
+ }
+
+ @Override
+ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) {
+ DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd);
+ List<V> means = new ArrayList<>(k);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ means.add(factory.newNumberVector(relation.get(iter)));
+ }
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distanceFunction) {
+ return DBIDUtil.randomSample(ids, k, rnd);
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer {
+ @Override
+ protected RandomlyChosenInitialMeans<V> makeInstance() {
+ return new RandomlyChosenInitialMeans<>(rnd);
+ }
+ }
}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java index 1329132e..4f7a21a0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/RandomlyGeneratedInitialMeans.java @@ -1,10 +1,10 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,19 +32,15 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.math.MathUtil; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; /** * Initialize k-means by generating random vectors (within the data sets value * range). * * @author Erich Schubert - * - * @param <V> Vector type */ -public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends AbstractKMeansInitialization<V> { +public class RandomlyGeneratedInitialMeans extends AbstractKMeansInitialization<NumberVector> { /** * Constructor. * @@ -55,17 +51,20 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab } @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? 
super NumberVector<?>, ?> distanceFunction) { + public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<V> factory) { final int dim = RelationUtil.dimensionality(relation); - NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); + double[][] minmax = RelationUtil.computeMinMax(relation); + double[] min = minmax[0], scale = minmax[1]; + for(int d = 0; d < dim; d++) { + scale[d] = scale[d] - min[d]; + } List<V> means = new ArrayList<>(k); final Random random = rnd.getSingleThreadedRandom(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); // Rescale for(int d = 0; d < dim; d++) { - r[d] = minmax.first.doubleValue(d) + (minmax.second.doubleValue(d) - minmax.first.doubleValue(d)) * r[d]; + r[d] = min[d] + scale[d] * r[d]; } means.add(factory.newNumberVector(r)); } @@ -79,10 +78,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer extends AbstractKMeansInitialization.Parameterizer { @Override - protected RandomlyGeneratedInitialMeans<V> makeInstance() { - return new RandomlyGeneratedInitialMeans<>(rnd); + protected RandomlyGeneratedInitialMeans makeInstance() { + return new RandomlyGeneratedInitialMeans(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java index 79013364..bd1bc5a8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/SampleKMeansInitialization.java @@ -1,10 +1,10 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,10 +26,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.model.ModelUtil; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -38,8 +40,8 @@ import de.lmu.ifi.dbs.elki.database.relation.ProxyView; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.logging.LoggingUtil; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; 
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; @@ -54,11 +56,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @param <V> Vector type */ -public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization<V> { +public class SampleKMeansInitialization<V extends NumberVector> extends AbstractKMeansInitialization<V> { /** - * Variant of kMeans for the bisecting step. + * Variant of kMeans to use for initialization. */ - private KMeans<V, D, ?> innerkMeans; + private KMeans<V, ?> innerkMeans; /** * Sample size. @@ -72,28 +74,36 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis * @param innerkMeans Inner k-means algorithm. * @param rate Sampling rate. */ - public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, D, ?> innerkMeans, double rate) { + public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, ?> innerkMeans, double rate) { super(rnd); this.innerkMeans = innerkMeans; this.rate = rate; } @Override - public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { + public <T extends V, O extends NumberVector> List<O> chooseInitialMeans(Database database, Relation<T> relation, int k, PrimitiveDistanceFunction<? super T> distanceFunction, NumberVector.Factory<O> factory) { final int samplesize = (int) Math.ceil(rate * relation.size()); final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), samplesize, rnd); - ProxyView<V> proxyv = new ProxyView<>(database, sample, relation); + // Ugly cast, sorry + @SuppressWarnings("unchecked") + Relation<V> rel = (Relation<V>) relation; + // FIXME: This does not necessarily hold. 
Check and fail! + if(!distanceFunction.getInputTypeRestriction().isAssignableFromType(TypeUtil.NUMBER_VECTOR_FIELD)) { + LoggingUtil.warning("Initializing k-means with k-means using specialized distance functions MAY fail, if the initialization method does require a distance defined on arbitrary number vectors."); + } + @SuppressWarnings("unchecked") + PrimitiveDistanceFunction<? super NumberVector> pdf = (PrimitiveDistanceFunction<? super NumberVector>) distanceFunction; + ProxyView<V> proxyv = new ProxyView<>(database, sample, rel); ProxyDatabase proxydb = new ProxyDatabase(sample, proxyv); innerkMeans.setK(k); - @SuppressWarnings("unchecked") - PrimitiveDistanceFunction<? super NumberVector<?>, D> df = (PrimitiveDistanceFunction<? super NumberVector<?>, D>) distanceFunction; - innerkMeans.setDistanceFunction(df); - Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv); - List<V> means = new ArrayList<>(); - for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) { - means.add(cluster.getModel().getMean()); + innerkMeans.setDistanceFunction(pdf); + Clustering<?> clusters = innerkMeans.run(proxydb, proxyv); + + List<O> means = new ArrayList<>(); + for(Cluster<?> cluster : clusters.getAllClusters()) { + means.add(factory.newNumberVector(ModelUtil.getPrototype(cluster.getModel(), relation))); } return means; @@ -107,9 +117,8 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis * @apiviz.exclude * * @param <V> Vector type - * @param <D> Distance type */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V extends NumberVector> extends AbstractKMeansInitialization.Parameterizer { /** * Parameter to specify the kMeans variant. */ @@ -123,8 +132,8 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis /** * Inner k-means algorithm to use. 
*/ - protected KMeans<V, D, ?> innerkMeans; - + protected KMeans<V, ?> innerkMeans; + /** * Sampling rate. */ @@ -133,8 +142,8 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<KMeans<V, D, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); - if (config.grab(kMeansVariantP)) { + ObjectParameter<KMeans<V, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); + if(config.grab(kMeansVariantP)) { ListParameterization kMeansVariantParameters = new ListParameterization(); // We will always invoke this with k as requested from outside! @@ -145,15 +154,15 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis combinedConfig.errorsTo(config); innerkMeans = kMeansVariantP.instantiateClass(combinedConfig); } - + DoubleParameter sampleP = new DoubleParameter(SAMPLE_ID); - if (config.grab(sampleP)) { + if(config.grab(sampleP)) { rate = sampleP.doubleValue(); } } @Override - protected SampleKMeansInitialization<V, D> makeInstance() { + protected SampleKMeansInitialization<V> makeInstance() { return new SampleKMeansInitialization<>(rnd, innerkMeans, rate); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java new file mode 100644 index 00000000..14af1d69 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/initialization/package-info.java @@ -0,0 +1,27 @@ +/** + * Initialization strategies for k-means. 
+ */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java index aa4c3e24..da9bd20e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java new file mode 100644 index 00000000..38133c6b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/parallel/KMeansProcessor.java @@ -0,0 +1,272 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.parallel.Executor;
import de.lmu.ifi.dbs.elki.parallel.processor.Processor;

/**
 * Parallel k-means implementation.
 *
 * This shared processor object aggregates the partial cluster assignments,
 * centroid sums and variance contributions produced by its per-thread
 * {@link Instance}s, one iteration at a time.
 *
 * @author Erich Schubert
 *
 * @apiviz.has Instance
 *
 * @param <V> Vector type
 */
public class KMeansProcessor<V extends NumberVector> implements Processor {
  /**
   * Data relation.
   */
  Relation<V> relation;

  /**
   * Distance function.
   */
  PrimitiveDistanceFunction<? super NumberVector> distance;

  /**
   * Assignment storage.
   */
  WritableIntegerDataStore assignment;

  /**
   * Mean vectors of the current iteration.
   */
  List<Vector> means;

  /**
   * Updated cluster centroids, aggregated across worker instances.
   */
  double[][] centroids;

  /**
   * (Partial) cluster sizes, aggregated across worker instances.
   */
  int[] sizes;

  /**
   * Variance sum per cluster.
   */
  double[] varsum;

  /**
   * Whether the assignment changed during the last iteration.
   */
  boolean changed = false;

  /**
   * Constructor.
   *
   * @param relation Data relation
   * @param distance Distance function
   * @param assignment Cluster assignment storage
   * @param varsum Variance sum output array (length k)
   */
  public KMeansProcessor(Relation<V> relation, PrimitiveDistanceFunction<? super NumberVector> distance, WritableIntegerDataStore assignment, double[] varsum) {
    super();
    this.distance = distance;
    this.relation = relation;
    this.assignment = assignment;
    this.varsum = varsum;
  }

  /**
   * Get the "has changed" value.
   *
   * @return Changed flag.
   */
  public boolean changed() {
    return changed;
  }

  /**
   * Initialize for a new iteration: install the new means and reset all
   * aggregation buffers.
   *
   * @param means New means.
   */
  public void nextIteration(List<Vector> means) {
    this.means = means;
    changed = false;
    final int k = means.size();
    final int dim = means.get(0).getDimensionality();
    centroids = new double[k][dim];
    sizes = new int[k];
    Arrays.fill(varsum, 0.);
  }

  @Override
  public Instance instantiate(Executor executor) {
    return new Instance(relation, distance, assignment, means);
  }

  @Override
  public void cleanup(Processor.Instance inst) {
    @SuppressWarnings("unchecked")
    Instance instance = (Instance) inst;
    // Merge this worker's partial result into the shared aggregates.
    synchronized(this) {
      changed |= instance.changed;
      for(int i = 0; i < centroids.length; i++) {
        final int sizeb = instance.sizes[i];
        if(sizeb == 0) {
          continue;
        }
        final int sizea = sizes[i];
        final double sum = sizea + sizeb;
        double[] cent = centroids[i];
        if(sizea > 0) {
          // Downweight the current partial mean to weight sizea/sum.
          VMath.timesEquals(cent, sizea / sum);
        }
        // instance.centroids[i] holds raw coordinate sums of sizeb points, so
        // scaling by 1/sum contributes the remaining weight sizeb/sum.
        VMath.plusTimesEquals(cent, instance.centroids[i], 1. / sum);
        sizes[i] += sizeb;
      }
      // BUGFIX: merge the variance sums exactly once per worker instance.
      // This used to be inside the per-cluster loop above, which added the
      // entire array once for every non-empty cluster and thus over-counted
      // the variance by roughly a factor of k.
      VMath.plusEquals(varsum, instance.varsum);
    }
  }

  /**
   * Get the new means computed in this iteration.
   *
   * @return New means
   */
  public List<Vector> getMeans() {
    ArrayList<Vector> newmeans = new ArrayList<>(centroids.length);
    for(int i = 0; i < centroids.length; i++) {
      if(sizes[i] == 0) {
        newmeans.add(means.get(i)); // Keep old mean for empty clusters.
        continue;
      }
      newmeans.add(new Vector(centroids[i]));
    }
    return newmeans;
  }

  /**
   * Instance to process part of the data set, for a single iteration.
   *
   * @author Erich Schubert
   */
  public class Instance implements Processor.Instance {
    /**
     * Data relation.
     */
    private Relation<V> relation;

    /**
     * Distance function.
     */
    private PrimitiveDistanceFunction<? super NumberVector> distance;

    /**
     * Cluster assignment storage.
     */
    private WritableIntegerDataStore assignment;

    /**
     * Current mean vectors (thread-local copy).
     */
    private Vector[] means;

    /**
     * Updated cluster centroids (raw coordinate sums, not yet divided).
     */
    private double[][] centroids;

    /**
     * (Partial) cluster sizes.
     */
    private int[] sizes;

    /**
     * Variance sum per cluster.
     */
    private double[] varsum;

    /**
     * Changed flag.
     */
    private boolean changed = false;

    /**
     * Constructor.
     *
     * @param relation Data relation
     * @param distance Distance function
     * @param assignment Current assignment
     * @param means Previous mean vectors
     */
    public Instance(Relation<V> relation, PrimitiveDistanceFunction<? super NumberVector> distance, WritableIntegerDataStore assignment, List<? extends NumberVector> means) {
      super();
      this.relation = relation;
      this.distance = distance;
      this.assignment = assignment;
      final int k = means.size();
      this.means = new Vector[k];
      Iterator<? extends NumberVector> iter = means.iterator();
      for(int i = 0; i < k; i++) {
        this.means[i] = iter.next().getColumnVector(); // Make local copy!
      }
      // Storage for updated means.
      final int dim = this.means[0].getDimensionality();
      this.centroids = new double[k][dim];
      this.sizes = new int[k];
      this.varsum = new double[k];
    }

    @Override
    public void map(DBIDRef id) {
      final V fv = relation.get(id);
      // Find the closest mean:
      double mindist = Double.POSITIVE_INFINITY;
      int minIndex = 0;
      for(int i = 0; i < means.length; i++) {
        final double dist = distance.distance(fv, means[i]);
        if(dist < mindist) {
          minIndex = i;
          mindist = dist;
        }
      }
      varsum[minIndex] += mindist;
      // Update assignment, and remember whether it changed:
      int prev = assignment.putInt(id, minIndex);
      changed |= (prev != minIndex);
      // Accumulate raw coordinate sums for the new centroid:
      double[] cent = centroids[minIndex];
      for(int d = 0; d < fv.getDimensionality(); d++) {
        // TODO: improve numerical stability via Kahan summation?
        cent[d] += fv.doubleValue(d);
      }
      ++sizes[minIndex];
    }
  }
}
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel;

import java.util.ArrayList;
import java.util.List;

import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.AbstractKMeans;
import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.parallel.ParallelExecutor;

/**
 * Parallel implementation of k-Means clustering (Lloyd's algorithm).
 *
 * Each iteration distributes the point-to-cluster assignment across threads
 * via a {@link KMeansProcessor}, then merges the partial centroids.
 *
 * @author Erich Schubert
 *
 * @apiviz.has KMeansProcessor
 *
 * @param <V> Vector type
 */
public class ParallelLloydKMeans<V extends NumberVector> extends AbstractKMeans<V, KMeansModel> {
  /**
   * Class logger.
   */
  private static final Logging LOG = Logging.getLogger(ParallelLloydKMeans.class);

  /**
   * Constructor.
   *
   * @param distanceFunction Distance function
   * @param k Number of clusters
   * @param maxiter Maximum number of iterations (non-positive: no limit)
   * @param initializer Initialization method for the first means
   */
  public ParallelLloydKMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) {
    super(distanceFunction, k, maxiter, initializer);
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
  }

  /**
   * Run parallel k-means on the given relation.
   *
   * @param database Database
   * @param relation Data relation
   * @return Clustering result with one {@link KMeansModel} per cluster
   */
  @Override
  public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    DBIDs ids = relation.getDBIDs();

    // Choose initial means
    List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY);

    // Store for current cluster assignment.
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    KMeansProcessor<V> kmm = new KMeansProcessor<>(relation, distanceFunction, assignment, varsum);

    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
      LOG.incrementProcessed(prog);
      kmm.nextIteration(means);
      ParallelExecutor.run(ids, kmm);
      // Stop if no cluster assignment changed.
      if(!kmm.changed()) {
        break;
      }
      means = kmm.getMeans();
    }
    LOG.setCompleted(prog);

    // Wrap result: collect the members of each cluster.
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for(int i = 0; i < k; i++) {
      // TODO: use cluster sizes from kmm!
      clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      clusters.get(assignment.intValue(iter)).add(iter);
    }

    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for(int i = 0; i < clusters.size(); i++) {
      DBIDs cids = clusters.get(i);
      if(cids.isEmpty()) {
        continue; // Empty clusters are omitted from the output.
      }
      KMeansModel model = new KMeansModel(means.get(i), varsum[i]);
      result.addToplevelCluster(new Cluster<>(cids, model));
    }
    return result;
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   *
   * @param <V> Vector type
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractKMeans.Parameterizer<V> {
    @Override
    protected Logging getLogger() {
      return LOG;
    }

    @Override
    protected ParallelLloydKMeans<V> makeInstance() {
      return new ParallelLloydKMeans<>(distanceFunction, k, maxiter, initializer);
    }
  }
}
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.parallel;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java new file mode 100644 index 00000000..88279023 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/AbstractKMeansQualityMeasure.java @@ -0,0 +1,239 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;

import java.util.Iterator;
import java.util.List;

import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;

/**
 * Base class for evaluating clusterings by information criteria (such as AIC or
 * BIC). Provides helper functions (e.g. max likelihood calculation) to its
 * subclasses.
 *
 * The use of information-theoretic criteria for evaluating k-means was
 * popularized by X-means:
 * <p>
 * D. Pelleg, A. Moore:<br />
 * X-means: Extending K-means with Efficient Estimation on the Number of
 * Clusters<br />
 * In: Proceedings of the 17th International Conference on Machine Learning
 * (ICML 2000)
 * </p>
 *
 * A different version of logLikelihood is derived in:
 * <p>
 * Q. Zhao, M. Xu, P. Fränti:<br />
 * Knee Point Detection on Bayesian Information Criterion<br />
 * 20th IEEE International Conference on Tools with Artificial Intelligence
 * </p>
 *
 * @author Tibor Goldschwendt
 * @author Erich Schubert
 *
 * @param <O> Vector type
 */
// BUGFIX: title and booktitle were swapped in both @Reference annotations.
@Reference(authors = "D. Pelleg, A. Moore", //
title = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
booktitle = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", //
url = "http://www.pelleg.org/shared/hp/download/xmeans.ps")
public abstract class AbstractKMeansQualityMeasure<O extends NumberVector> implements KMeansQualityMeasure<O> {
  /**
   * Compute the number of points in a given set of clusters (which may be
   * <i>less</i> than the complete data set for X-means!)
   *
   * @param clustering Clustering to analyze
   * @return Number of points
   */
  public static int numPoints(Clustering<? extends MeanModel> clustering) {
    int n = 0;
    for(Cluster<? extends MeanModel> aCluster : clustering.getAllClusters()) {
      n += aCluster.size();
    }
    return n;
  }

  /**
   * Variance contribution of a single cluster.
   *
   * If possible, this information is reused from the clustering process (when a
   * KMeansModel is returned).
   *
   * @param cluster Cluster to access
   * @param distanceFunction Distance function
   * @param relation Data relation
   * @return Cluster variance
   */
  public static double varianceOfCluster(Cluster<? extends MeanModel> cluster, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<? extends NumberVector> relation) {
    MeanModel model = cluster.getModel();
    if(model instanceof KMeansModel) {
      return ((KMeansModel) model).getVarianceContribution();
    }
    // Re-compute the variance from the cluster members:
    DBIDs ids = cluster.getIDs();
    Vector mean = model.getMean();

    // Squared Euclidean distances are already "squared"; other distances must
    // be squared here to sum up to a variance.
    boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction);
    double variance = 0.;
    for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      double dist = distanceFunction.distance(relation.get(iter), mean);
      variance += squared ? dist : dist * dist;
    }
    return variance;
  }

  /**
   * Computes log likelihood of an entire clustering.
   *
   * Version as used in the X-means publication.
   *
   * @param relation Data relation
   * @param clustering Clustering
   * @param distanceFunction Distance function
   * @return Log Likelihood.
   */
  @Reference(authors = "D. Pelleg, A. Moore", //
  title = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
  booktitle = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", //
  url = "http://www.pelleg.org/shared/hp/download/xmeans.ps")
  public static double logLikelihood(Relation<? extends NumberVector> relation, Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
    List<? extends Cluster<? extends MeanModel>> clusters = clustering.getAllClusters();
    // dimensionality of data points
    final int dim = RelationUtil.dimensionality(relation);
    // number of clusters
    final int m = clusters.size();

    // number of objects in the clustering
    int n = 0;
    // cluster sizes
    int[] n_i = new int[m];
    // total variance
    double d = 0.;
    // variances
    double[] d_i = new double[m];

    // Iterate over clusters:
    Iterator<? extends Cluster<? extends MeanModel>> it = clusters.iterator();
    for(int i = 0; it.hasNext(); ++i) {
      Cluster<? extends MeanModel> cluster = it.next();
      n += n_i[i] = cluster.size();
      d += d_i[i] = varianceOfCluster(cluster, distanceFunction, relation);
    }

    // Total variance (corrected for bias)
    final double v = d / (n - m), logv = Math.log(v);
    // log likelihood of this clustering
    double logLikelihood = 0.;

    // Aggregate
    for(int i = 0; i < m; i++) {
      logLikelihood += n_i[i] * Math.log(n_i[i]) // Posterior entropy, Rn log Rn
          - n_i[i] * .5 * MathUtil.LOGTWOPI // Rn/2 log2pi
          - n_i[i] * dim * .5 * logv // Rn M/2 log sigma^2
          // BUGFIX: this term is (Rn - K)/2 in the X-means paper, i.e. it uses
          // the cluster size n_i, not the cluster variance d_i.
          - (n_i[i] - m) * .5; // (Rn-K)/2
    }
    logLikelihood -= n * Math.log(n); // Prior entropy, sum_i Rn log R

    return logLikelihood;
  }

  /**
   * Computes log likelihood of an entire clustering.
   *
   * Version as used by Zhao et al.
   *
   * @param relation Data relation
   * @param clustering Clustering
   * @param distanceFunction Distance function
   * @return Log Likelihood.
   */
  @Reference(authors = "Q. Zhao, M. Xu, P. Fränti", //
  title = "Knee Point Detection on Bayesian Information Criterion", //
  booktitle = "20th IEEE International Conference on Tools with Artificial Intelligence", //
  url = "http://dx.doi.org/10.1109/ICTAI.2008.154")
  public static double logLikelihoodAlternate(Relation<? extends NumberVector> relation, Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction) {
    List<? extends Cluster<? extends MeanModel>> clusters = clustering.getAllClusters();
    // dimensionality of data points
    final int dim = RelationUtil.dimensionality(relation);
    // number of clusters
    final int m = clusters.size();

    // number of objects in the clustering
    int n = 0;
    // cluster sizes
    int[] n_i = new int[m];
    // variances
    double[] d_i = new double[m];

    // Iterate over clusters:
    Iterator<? extends Cluster<? extends MeanModel>> it = clusters.iterator();
    for(int i = 0; it.hasNext(); ++i) {
      Cluster<? extends MeanModel> cluster = it.next();
      n += n_i[i] = cluster.size();
      d_i[i] = varianceOfCluster(cluster, distanceFunction, relation);
    }

    // log likelihood of this clustering
    double logLikelihood = 0.;

    // Aggregate
    for(int i = 0; i < m; i++) {
      logLikelihood += n_i[i] * Math.log(n_i[i] / (double) n) // ni log ni/n
          - n_i[i] * dim * .5 * MathUtil.LOGTWOPI // ni*d/2 log2pi
          - n_i[i] * .5 * Math.log(d_i[i]) // ni/2 log sigma_i
          - (n_i[i] - m) * .5; // (ni-m)/2
    }
    return logLikelihood;
  }

  /**
   * Compute the number of free parameters.
   *
   * @param relation Data relation (for dimensionality)
   * @param clustering Set of clusters
   * @return Number of free parameters
   */
  public static int numberOfFreeParameters(Relation<? extends NumberVector> relation, Clustering<? extends MeanModel> clustering) {
    // number of clusters
    int m = clustering.getAllClusters().size(); // num_ctrs

    // dimensionality of data points
    int dim = RelationUtil.dimensionality(relation); // num_dims

    // number of free parameters: (m-1) mixture weights, m*dim mean
    // coordinates, and m variance estimates.
    return (m - 1) + m * dim + m;
  }
}
+ */ + +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + +/** + * Akaike Information Criterion (AIC). + * + * Reference: + * <p> + * H. Akaike<br /> + * On entropy maximization principle<br /> + * Application of statistics, 1977, North-Holland + * </p> + * + * The use for k-means was popularized by: + * <p> + * D. Pelleg, A. Moore:<br /> + * X-means: Extending K-means with Efficient Estimation on the Number of + * Clusters<br /> + * In: Proceedings of the 17th International Conference on Machine Learning + * (ICML 2000) + * </p> + * + * @author Tibor Goldschwendt + * @author Erich Schubert + */ +@Reference(authors = "H. Akaike", // +title = "On entropy maximization principle", // +booktitle = "Application of statistics, 1977, North-Holland") +public class AkaikeInformationCriterion extends AbstractKMeansQualityMeasure<NumberVector> { + @Override + public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) { + return logLikelihood(relation, clustering, distanceFunction) - numberOfFreeParameters(relation, clustering); + } + + @Override + public boolean ascending() { + return true; + } + + @Override + public boolean isBetter(double currentCost, double bestCost) { + // Careful: bestCost may be NaN! 
+ return !(currentCost <= bestCost); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java new file mode 100644 index 00000000..73b0c501 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterion.java @@ -0,0 +1,77 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + +/** + * Bayesian Information Criterion (BIC), also known as Schwarz criterion (SBC, + * SBIC) for the use with evaluating k-means results. + * + * Reference: + * <p> + * G. 
Schwarz<br /> + * Estimating the dimension of a model<br /> + * The annals of statistics 6.2. + * </p> + * + * The use for k-means was popularized by: + * <p> + * D. Pelleg, A. Moore:<br /> + * X-means: Extending K-means with Efficient Estimation on the Number of + * Clusters<br /> + * In: Proceedings of the 17th International Conference on Machine Learning + * (ICML 2000) + * </p> + * + * @author Tibor Goldschwendt + * @author Erich Schubert + */ +@Reference(authors = "G. Schwarz", // +title = "Estimating the dimension of a model", // +booktitle = "The annals of statistics 6.2", // +url = "http://dx.doi.org/10.1214/aos/1176344136") +public class BayesianInformationCriterion extends AbstractKMeansQualityMeasure<NumberVector> { + @Override + public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) { + return logLikelihood(relation, clustering, distanceFunction) // + - (.5 * numberOfFreeParameters(relation, clustering)) * Math.log(numPoints(clustering)); + } + + @Override + public boolean ascending() { + return true; + } + + @Override + public boolean isBetter(double currentCost, double bestCost) { + // Careful: bestCost may be NaN! 
+ return !(currentCost <= bestCost); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java new file mode 100644 index 00000000..3fa646c2 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/BayesianInformationCriterionZhao.java @@ -0,0 +1,67 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + +/** + * Different version of the BIC criterion. + * + * Reference: + * <p> + * Q. Zhao, M. Xu, P. 
Fränti:<br /> + * Knee Point Detection on Bayesian Information Criterion<br /> + * 20th IEEE International Conference on Tools with Artificial Intelligence + * </p> + * + * @author Tibor Goldschwendt + * @author Erich Schubert + */ +@Reference(authors = "Q. Zhao, M. Xu, P. Fränti", // +title = "Knee Point Detection on Bayesian Information Criterion", // +booktitle = "20th IEEE International Conference on Tools with Artificial Intelligence", // +url = "http://dx.doi.org/10.1109/ICTAI.2008.154") +public class BayesianInformationCriterionZhao extends AbstractKMeansQualityMeasure<NumberVector> { + @Override + public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) { + return logLikelihoodAlternate(relation, clustering, distanceFunction) // + - (.5 * clustering.getAllClusters().size()) * Math.log(numPoints(clustering)); + } + + @Override + public boolean ascending() { + return true; + } + + @Override + public boolean isBetter(double currentCost, double bestCost) { + // Careful: bestCost may be NaN! 
+ return !(currentCost <= bestCost); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java index f2de7846..2a234261 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -28,17 +28,17 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.MeanModel; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; /** * Interface for computing the quality of a K-Means clustering. * + * Important note: some measures are ascending, others are descending! + * * @author Erich Schubert * * @param <O> Input Object restriction type - * @param <D> Distance restriction type */ -public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Distance<?>> { +public interface KMeansQualityMeasure<O extends NumberVector> { /** * Calculates and returns the quality measure. * @@ -50,5 +50,22 @@ public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Dista * * @return quality measure */ - <V extends O> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends D> distanceFunction, Relation<V> relation); + <V extends O> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? 
super NumberVector> distanceFunction, Relation<V> relation); + + /** + * Use ascending or descending ordering. + * + * @return {@code true} when larger scores are better. + */ + boolean ascending(); + + /** + * Compare two scores. + * + * @param currentCost New (candidate) cost/score + * @param bestCost Existing best cost/score (may be {@code NaN}) + * @return {@code true} when the new score is better, or the old score is + * {@code NaN}. + */ + boolean isBetter(double currentCost, double bestCost); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java index e0ddfff0..9083f924 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,8 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import java.util.List; - import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -32,8 +30,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; /** * Class for computing the average overall distance. @@ -42,48 +38,35 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; * * @author Stephan Baier */ -public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> { +public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector> { @Override - public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) { - @SuppressWarnings("unchecked") - final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters(); - - if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) { - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction; - double clusterDistanceSum = 0; - for (Cluster<MeanModel<V>> cluster : clusterList) { - DBIDs ids = cluster.getIDs(); + public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) { + double clusterDistanceSum = 0; + for(Cluster<? 
extends MeanModel> cluster : clustering.getAllClusters()) { + DBIDs ids = cluster.getIDs(); - // Compute sum of pairwise distances: - double clusterPairwiseDistanceSum = 0; - for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { - V obj1 = relation.get(iter1); - for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - clusterPairwiseDistanceSum += df.doubleDistance(obj1, relation.get(iter2)); - } + // Compute sum of pairwise distances: + double clusterPairwiseDistanceSum = 0; + for(DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { + V obj1 = relation.get(iter1); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2)); } - clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); } + clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); + } - return clusterDistanceSum / clusterList.size(); - } else { - double clusterDistanceSum = 0; - for (Cluster<MeanModel<V>> cluster : clusterList) { - DBIDs ids = cluster.getIDs(); + return clusterDistanceSum / clustering.getAllClusters().size(); + } - // Compute sum of pairwise distances: - double clusterPairwiseDistanceSum = 0; - for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { - V obj1 = relation.get(iter1); - for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { - clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2)).doubleValue(); - } - } - clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); - } + @Override + public boolean ascending() { + return false; + } - return clusterDistanceSum / clusterList.size(); - } + @Override + public boolean isBetter(double currentCost, double bestCost) { + // Careful: bestCost may be NaN! 
+ return !(currentCost >= bestCost); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java index 32ad5210..513e3ca3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,62 +22,37 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.List; - import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; /** * Class for computing the variance in a clustering result (sum-of-squares). 
* * @author Stephan Baier + * @author Erich Schubert */ -public class WithinClusterVarianceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> { +public class WithinClusterVarianceQualityMeasure extends AbstractKMeansQualityMeasure<NumberVector> { @Override - public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) { - @SuppressWarnings("unchecked") - final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters(); - - boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction); - if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) { - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction; - double variance = 0.0; - for (Cluster<MeanModel<V>> cluster : clusterList) { - DBIDs ids = cluster.getIDs(); - V mean = cluster.getModel().getMean(); + public <V extends NumberVector> double quality(Clustering<? extends MeanModel> clustering, PrimitiveDistanceFunction<? super NumberVector> distanceFunction, Relation<V> relation) { + double variance = 0.; + for(Cluster<? 
extends MeanModel> cluster : clustering.getAllClusters()) { + variance += varianceOfCluster(cluster, distanceFunction, relation); + } + return variance; + } - for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - double dist = df.doubleDistance(relation.get(iter), mean); - if (squared) { - variance += dist; - } else { - variance += dist * dist; - } - } - } - return variance; - } else { - double variance = 0.0; - for (Cluster<MeanModel<V>> cluster : clusterList) { - DBIDs ids = cluster.getIDs(); - V mean = cluster.getModel().getMean(); + @Override + public boolean ascending() { + return false; + } - for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - double dist = distanceFunction.distance(relation.get(iter), mean).doubleValue(); - variance += dist * dist; - } - } - return variance; - } + @Override + public boolean isBetter(double currentCost, double bestCost) { + // Careful: bestCost may be NaN! + return !(currentCost >= bestCost); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java index 1be19bd1..767048db 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java @@ -6,7 +6,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |