summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java')
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java  393
1 file changed, 393 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
new file mode 100644
index 00000000..4e31a6ba
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/XMeans.java
@@ -0,0 +1,393 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PredefinedInitialMeans;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality.KMeansQualityMeasure;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * X-means: Extending K-means with Efficient Estimation on the Number of
+ * Clusters.
+ *
+ * Note: this implementation does currently <em>not</em> use a k-d-tree for
+ * acceleration. Also note that k_max is not a hard threshold - the algorithm
+ * can return up to 2*k_max clusters!
+ *
+ * Reference:<br>
+ * <p>
+ * D. Pelleg, A. Moore:<br />
+ * X-means: Extending K-means with Efficient Estimation on the Number of
+ * Clusters<br />
+ * In: Proceedings of the 17th International Conference on Machine Learning
+ * (ICML 2000)
+ * </p>
+ *
+ * @author Tibor Goldschwendt
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <M> Model type
+ */
+@Reference(authors = "D. Pelleg, A. Moore", //
+booktitle = "X-means: Extending K-means with Efficient Estimation on the Number of Clusters", //
+title = "Proceedings of the 17th International Conference on Machine Learning (ICML 2000)", //
+url = "http://www.pelleg.org/shared/hp/download/xmeans.ps")
+public class XMeans<V extends NumberVector, M extends MeanModel> extends AbstractKMeans<V, M> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(XMeans.class);
+
+ /**
+ * Inner k-means algorithm.
+ */
+ private KMeans<V, M> innerKMeans;
+
+ /**
+ * Effective number of clusters, minimum and maximum.
+ */
+ private int k, k_min, k_max;
+
+ /**
+ * Initializer for k-means.
+ */
+ PredefinedInitialMeans splitInitializer;
+
+ /**
+ * Information criterion to choose the better split.
+ */
+ KMeansQualityMeasure<V> informationCriterion;
+
+ /**
+ * Random factory.
+ */
+ RandomFactory rnd;
+
  /**
   * Constructor.
   *
   * @param distanceFunction Distance function
   * @param k_min k_min parameter - minimum number of result clusters
   * @param k_max k_max parameter - maximum number of result clusters
   * @param maxiter Maximum number of iterations each.
   * @param innerKMeans K-Means variant to use inside.
   * @param initializer Initializer for the first k_min cluster centers.
   * @param splitInitializer Initializer used to feed split centers to the
   *        inner k-means runs.
   * @param informationCriterion The information criterion used for the
   *        splitting step
   * @param random Random factory
   */
  public XMeans(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k_min, int k_max, int maxiter, KMeans<V, M> innerKMeans, KMeansInitialization<? super V> initializer, PredefinedInitialMeans splitInitializer, KMeansQualityMeasure<V> informationCriterion, RandomFactory random) {
    // Note: the superclass is initialized with k_min; k grows during run().
    super(distanceFunction, k_min, maxiter, initializer);
    this.k_min = k_min;
    this.k_max = k_max;
    this.k = k_min;
    this.innerKMeans = innerKMeans;
    this.splitInitializer = splitInitializer;
    this.informationCriterion = informationCriterion;
    this.rnd = random;
  }
+
  /**
   * Run the algorithm on a database and relation.
   *
   * Starts with a k-means run at k = k_min, then alternates
   * "Improve-Structure" (try to split every cluster in two, keeping only
   * splits that improve the information criterion) and "Improve-Params"
   * (re-run k-means seeded with the accepted centers), until no cluster is
   * split anymore or the cluster count exceeds k_max.
   *
   * @param database Database to process
   * @param relation Data relation
   * @return Clustering result.
   */
  @Override
  public Clustering<M> run(Database database, Relation<V> relation) {
    MutableProgress prog = LOG.isVerbose() ? new MutableProgress("X-means number of clusters", k_max, LOG) : null;

    // Run initial k-means to find at least k_min clusters
    innerKMeans.setK(k_min);
    splitInitializer.setInitialMeans(initializer.chooseInitialMeans(database, relation, k_min, getDistanceFunction(), Vector.FACTORY));
    Clustering<M> clustering = innerKMeans.run(database, relation);

    if(prog != null) {
      prog.setProcessed(k_min, LOG);
    }

    ArrayList<Cluster<M>> clusters = new ArrayList<>(clustering.getAllClusters());
    // The loop condition is checked before splitting, so every cluster may be
    // split once more in the last iteration: up to 2*k_max clusters result.
    while(clusters.size() <= k_max) {
      // Improve-Structure:
      ArrayList<Cluster<M>> nextClusters = new ArrayList<>();
      for(Cluster<M> cluster : clusters) {
        // Try to split this cluster:
        List<Cluster<M>> childClusterList = splitCluster(cluster, database, relation);
        nextClusters.addAll(childClusterList);
        if(childClusterList.size() > 1) {
          // An accepted split replaces one cluster with two (or more).
          k += childClusterList.size() - 1;
          if(prog != null) {
            if(k >= k_max) {
              // Grow the progress total, since we may exceed k_max (see above).
              prog.setTotal(k + 1);
            }
            prog.setProcessed(k, LOG);
          }
        }
      }
      if(clusters.size() == nextClusters.size()) {
        // No cluster was split - the structure is stable, we are done.
        break;
      }
      // Improve-Params: refine the accepted centers with a full k-means run.
      splitInitializer.setInitialClusters(nextClusters);
      innerKMeans.setK(nextClusters.size());
      clustering = innerKMeans.run(database, relation);
      clusters.clear();
      clusters.addAll(clustering.getAllClusters());
    }

    // Ensure that the progress bar finished.
    if(prog != null) {
      prog.setTotal(k);
      prog.setProcessed(k, LOG);
    }

    if(LOG.isDebugging()) {
      LOG.debug("X-means returned k=" + k + " clusters.");
    }

    // add all current clusters to the result
    Clustering<M> result = new Clustering<>("X-Means Result", "X-Means", clusters);
    return result;
  }
+
  /**
   * Conditionally splits the clusters based on the information criterion.
   *
   * Runs a 2-means on the members of the parent cluster (seeded by
   * {@link #splitCentroid}) and keeps the split only when the information
   * criterion rates the two children better than the single parent.
   *
   * @param parentCluster Cluster to split
   * @param database Database
   * @param relation Data relation
   * @return Parent cluster when split decreases clustering quality or child
   *         clusters when split improves clustering.
   */
  protected List<Cluster<M>> splitCluster(Cluster<M> parentCluster, Database database, Relation<V> relation) {
    // Transform parent cluster into a clustering
    ArrayList<Cluster<M>> parentClusterList = new ArrayList<Cluster<M>>(1);
    parentClusterList.add(parentCluster);
    Clustering<M> parentClustering = new Clustering<>(parentCluster.getName(), parentCluster.getName(), parentClusterList);

    if(parentCluster.size() < 2) {
      // Split is not possible
      return parentClusterList;
    }

    // Restrict the inner k-means to the members of the parent cluster only.
    ProxyDatabase proxyDB = new ProxyDatabase(parentCluster.getIDs(), database);

    splitInitializer.setInitialMeans(splitCentroid(parentCluster, relation));
    innerKMeans.setK(2);
    Clustering<M> childClustering = innerKMeans.run(proxyDB);

    double parentEvaluation = informationCriterion.quality(parentClustering, getDistanceFunction(), relation);
    double childrenEvaluation = informationCriterion.quality(childClustering, getDistanceFunction(), relation);

    if(LOG.isDebugging()) {
      LOG.debug("parentEvaluation: " + parentEvaluation);
      LOG.debug("childrenEvaluation: " + childrenEvaluation);
    }

    // Check if split is an improvement:
    // The XOR handles both orientations of the quality measure: the children
    // are kept exactly when their score is on the "better" side of the
    // parent's score (larger if ascending(), smaller otherwise).
    return (childrenEvaluation > parentEvaluation) ^ informationCriterion.ascending() ? parentClusterList : childClustering.getAllClusters();
  }
+
+ /**
+ * Split an existing centroid into two initial centers.
+ *
+ * @param parentCluster Existing cluster
+ * @param relation Data relation
+ * @return List of new centroids
+ */
+ protected List<? extends NumberVector> splitCentroid(Cluster<? extends MeanModel> parentCluster, Relation<V> relation) {
+ Vector parentCentroid = parentCluster.getModel().getMean();
+
+ // Compute size of cluster/region
+ double radius = 0.;
+ for(DBIDIter it = parentCluster.getIDs().iter(); it.valid(); it.advance()) {
+ double d = getDistanceFunction().distance(relation.get(it), parentCentroid);
+ radius = (d > radius) ? d : radius;
+ }
+
+ // Choose random vector
+ Random random = rnd.getSingleThreadedRandom();
+ final int dim = RelationUtil.dimensionality(relation);
+ Vector randomVector = VectorUtil.randomVector(Vector.FACTORY, dim, random).normalize();
+ randomVector.timesEquals((.4 + random.nextDouble() * .5) * radius);
+
+ // Get the new centroids
+ ArrayList<Vector> vecs = new ArrayList<>(2);
+ vecs.add(parentCentroid.minus(randomVector));
+ vecs.add(randomVector.plusEquals(parentCentroid));
+ return vecs;
+ }
+
  // Delegate to the inner k-means, which performs the actual data access.
  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return innerKMeans.getInputTypeRestriction();
  }
+
  // Class logger, required by the AbstractKMeans contract.
  @Override
  protected Logging getLogger() {
    return LOG;
  }
+
  /**
   * Parameterization class.
   *
   * @author Tibor Goldschwendt
   * @author Erich Schubert
   *
   * @apiviz.exclude
   *
   * @param <V> Vector type
   * @param <M> Model type of inner algorithm
   */
  public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractKMeans.Parameterizer<V> {
    /**
     * Parameter to specify the kMeans variant.
     */
    public static final OptionID INNER_KMEANS_ID = new OptionID("xmeans.kmeans", "kMeans algorithm to use.");

    /**
     * Minimum number of clusters.
     */
    public static final OptionID K_MIN_ID = new OptionID("xmeans.k_min", "The minimum number of clusters to find.");

    /**
     * Randomization seed.
     */
    public static final OptionID SEED_ID = new OptionID("xmeans.seed", "Random seed for splitting clusters.");

    /**
     * Quality measure to use for evaluating splits.
     */
    public static final OptionID INFORMATION_CRITERION_ID = new OptionID("xmeans.quality", "The quality measure to evaluate splits (e.g. AIC, BIC)");

    /**
     * Variant of kMeans to run as the inner algorithm.
     */
    protected KMeans<V, M> innerKMeans;

    /**
     * Class to feed splits to the internal k-means algorithm.
     */
    protected PredefinedInitialMeans splitInitializer;

    /**
     * Information criterion.
     */
    protected KMeansQualityMeasure<V> informationCriterion;

    /**
     * Minimum and maximum number of result clusters.
     */
    protected int k_min, k_max;

    /**
     * Random number generator.
     */
    private RandomFactory random;

    @Override
    protected void makeOptions(Parameterization config) {
      // Do NOT invoke super.makeOptions to hide the "k" parameter.
      IntParameter kMinP = new IntParameter(K_MIN_ID, 2) //
      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(kMinP)) {
        k_min = kMinP.intValue();
      }
      // k_max deliberately reuses the regular "kmeans.k" option id.
      IntParameter kMaxP = new IntParameter(KMeans.K_ID) //
      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(kMaxP)) {
        k_max = kMaxP.intValue();
      }
      // We allow k_min = k_max.
      config.checkConstraint(new LessEqualGlobalConstraint<>(kMinP, kMaxP));
      getParameterInitialization(config);
      getParameterMaxIter(config);
      getParameterDistanceFunction(config);

      RandomParameter rndP = new RandomParameter(SEED_ID);
      if(config.grab(rndP)) {
        random = rndP.getValue();
      }
      // Placeholder initializer: the actual means/clusters are injected at
      // runtime (see run() and splitCluster()) before each inner k-means run.
      splitInitializer = new PredefinedInitialMeans((List<Vector>) null);

      ObjectParameter<KMeans<V, M>> innerKMeansP = new ObjectParameter<>(INNER_KMEANS_ID, KMeans.class, KMeansLloyd.class);
      if(config.grab(innerKMeansP)) {
        // Preconfigure the inner k-means with our k/init/maxiter/distance
        // values; the chained parameterization lets remaining options come
        // from the user configuration, and forwards errors to it.
        ListParameterization initialKMeansVariantParameters = new ListParameterization();
        initialKMeansVariantParameters.addParameter(KMeans.K_ID, k_min);
        initialKMeansVariantParameters.addParameter(KMeans.INIT_ID, splitInitializer);
        initialKMeansVariantParameters.addParameter(KMeans.MAXITER_ID, maxiter);
        initialKMeansVariantParameters.addParameter(KMeans.DISTANCE_FUNCTION_ID, distanceFunction);
        ChainedParameterization combinedConfig = new ChainedParameterization(initialKMeansVariantParameters, config);
        combinedConfig.errorsTo(config);
        innerKMeans = innerKMeansP.instantiateClass(combinedConfig);
      }

      ObjectParameter<KMeansQualityMeasure<V>> informationCriterionP = new ObjectParameter<>(INFORMATION_CRITERION_ID, KMeansQualityMeasure.class);
      if(config.grab(informationCriterionP)) {
        informationCriterion = informationCriterionP.instantiateClass(config);
      }
    }

    @Override
    protected Logging getLogger() {
      return LOG;
    }

    @Override
    protected XMeans<V, M> makeInstance() {
      return new XMeans<V, M>(distanceFunction, k_min, k_max, maxiter, innerKMeans, initializer, splitInitializer, informationCriterion, random);
    }
  }
+}