diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
50 files changed, 2124 insertions, 481 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index 5712d814..ea441655 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index 95d88b93..108ba0ed 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -340,9 +340,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } } - if(processedIDs.size() == distFunc.getRelation().size() && noise.size() == 0) { + /* if(processedIDs.size() == relation.size() && noise.size() == 0) { break; - } + } */ } if(currentCluster.size() >= minpts) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java index e28dbff3..5ec59777 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index 2576c5f6..b59af555 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -33,7 +33,6 @@ import de.lmu.ifi.dbs.elki.data.model.ClusterModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -132,7 +131,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor /** * Performs the DBSCAN algorithm on the given database. */ - public Clustering<Model> run(Database database, Relation<O> relation) { + public Clustering<Model> run(Relation<O> relation) { RangeQuery<O, D> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction()); final int size = relation.size(); @@ -142,9 +141,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(size); if(size >= minpts) { - for(DBID id : rangeQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { if(!processedIDs.contains(id)) { - expandCluster(database, rangeQuery, id, objprog, clusprog); + expandCluster(relation, rangeQuery, id, objprog, clusprog); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), logger); @@ -156,7 +155,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } } else { - for(DBID id : rangeQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { noise.add(id); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), logger); @@ -189,12 +188,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * <p/> * Border-Objects become members of the first possible cluster. * - * @param database the database on which the algorithm is run + * @param relation Database relation to run on * @param rangeQuery Range query to use * @param startObjectID potential seed of a new potential cluster * @param objprog the progress object for logging the current status */ - protected void expandCluster(Database database, RangeQuery<O, D> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { + protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { List<DistanceResultPair<D>> seeds = rangeQuery.getRangeForDBID(startObjectID, epsilon); // startObject is no core-object @@ -245,7 +244,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } } - if(processedIDs.size() == rangeQuery.getRelation().size() && noise.size() == 0) { + if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index ca401ddc..f1e6c945 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index c1285659..a70a3f6f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,9 +25,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; import java.util.ArrayList; import java.util.List; -import java.util.Random; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyGeneratedInitialMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -42,6 +43,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; @@ -58,8 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualCons import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the EM algorithm (clustering by expectation maximization). @@ -113,6 +114,11 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri */ public static final OptionID DELTA_ID = OptionID.getOrCreateOptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta"); + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + private static final double MIN_LOGLIKELIHOOD = -100000; /** @@ -121,32 +127,27 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri private double delta; /** - * Parameter to specify the random generator seed. + * Store the individual probabilities, for use by EMOutlierDetection etc. */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("em.seed", "The random number generator seed."); + private WritableDataStore<double[]> probClusterIGivenX; /** - * Holds the value of {@link #SEED_ID}. + * Class to choose the initial means */ - private Long seed; - - /** - * Store the individual probabilities, for use by EMOutlierDetection etc. - */ - private WritableDataStore<double[]> probClusterIGivenX; + private KMeansInitialization<V> initializer; /** * Constructor. * * @param k k parameter * @param delta delta parameter - * @param seed Seed parameter + * @param initializer Class to choose the initial means */ - public EM(int k, double delta, Long seed) { + public EM(int k, double delta, KMeansInitialization<V> initializer) { super(); this.k = k; this.delta = delta; - this.seed = seed; + this.initializer = initializer; } /** @@ -169,14 +170,14 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri if(logger.isVerbose()) { logger.verbose("initializing " + k + " models"); } - List<V> means = initialMeans(relation); + List<Vector> means = initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC); List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k); List<Double> normDistrFactor = new ArrayList<Double>(k); List<Matrix> invCovMatr = new ArrayList<Matrix>(k); List<Double> clusterWeights = new ArrayList<Double>(k); probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); - int dimensionality = means.get(0).getDimensionality(); + final int dimensionality = means.get(0).getDimensionality(); for(int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); covarianceMatrices.add(m); @@ -211,12 +212,12 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri em = emNew; // recompute models - List<V> meanSums = new ArrayList<V>(k); + List<Vector> meanSums = new ArrayList<Vector>(k); double[] sumOfClusterProbabilities = new double[k]; for(int i = 0; i < k; i++) { clusterWeights.set(i, 0.0); - meanSums.add(means.get(i).nullVector()); + meanSums.add(new Vector(dimensionality)); covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality)); } @@ -226,24 +227,23 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri for(int i = 0; i < k; i++) { sumOfClusterProbabilities[i] += clusterProbabilities[i]; - V summand = relation.get(id).multiplicate(clusterProbabilities[i]); - V currentMeanSum = meanSums.get(i).plus(summand); - meanSums.set(i, currentMeanSum); + Vector summand = relation.get(id).getColumnVector().timesEquals(clusterProbabilities[i]); + meanSums.get(i).plusEquals(summand); } } final int n = relation.size(); for(int i = 0; i < k; i++) { clusterWeights.set(i, sumOfClusterProbabilities[i] / n); - V newMean = meanSums.get(i).multiplicate(1 / sumOfClusterProbabilities[i]); + Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]); means.set(i, newMean); } // covariance matrices for(DBID id : relation.iterDBIDs()) { double[] clusterProbabilities = probClusterIGivenX.get(id); - V instance = relation.get(id); + Vector instance = relation.get(id).getColumnVector(); for(int i = 0; i < k; i++) { - V difference = instance.minus(means.get(i)); - covarianceMatrices.get(i).plusEquals(difference.getColumnVector().times(difference.getRowVector()).times(clusterProbabilities[i])); + Vector difference = instance.minus(means.get(i)); + covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i])); } } for(int i = 0; i < k; i++) { @@ -281,13 +281,14 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri } hardClusters.get(maxIndex).add(id); } + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering"); // provide models within the result for(int i = 0; i < k; i++) { // TODO: re-do labeling. // SimpleClassLabel label = new SimpleClassLabel(); // label.init(result.canonicalClusterLabel(i)); - Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(means.get(i), covarianceMatrices.get(i))); + Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i))); result.addCluster(model); } return result; @@ -308,24 +309,20 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri * @param clusterWeights the weights of the current clusters * @return the expectation value of the current mixture of distributions */ - protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<V> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { + protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { double emSum = 0.0; for(DBID id : database.iterDBIDs()) { - V x = database.get(id); + Vector x = database.get(id).getColumnVector(); List<Double> probabilities = new ArrayList<Double>(k); for(int i = 0; i < k; i++) { - V difference = x.minus(means.get(i)); - Matrix differenceRow = difference.getRowVector(); - Vector differenceCol = difference.getColumnVector(); - Matrix rowTimesCov = differenceRow.times(invCovMatr.get(i)); - Vector rowTimesCovTimesCol = rowTimesCov.times(differenceCol); - double power = rowTimesCovTimesCol.get(0, 0) / 2.0; + Vector difference = x.minus(means.get(i)); + double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference); + double power = rowTimesCovTimesCol / 2.0; double prob = normDistrFactor.get(i) * Math.exp(-power); if(logger.isDebuggingFinest()) { - logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " differenceRow:\n" + FormatUtil.format(differenceRow, " ") + "\n" + " differenceCol:\n" + FormatUtil.format(differenceCol, " ") + "\n" + " rowTimesCov:\n" + FormatUtil.format(rowTimesCov, " ") + "\n" + " rowTimesCovTimesCol:\n" + FormatUtil.format(rowTimesCovTimesCol, " ") + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); + logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); } - probabilities.add(prob); } double priorProbability = 0.0; @@ -356,48 +353,6 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri } /** - * Creates {@link #k k} random points distributed uniformly within the - * attribute ranges of the given database. - * - * @param relation the database must contain enough points in order to - * ascertain the range of attribute values. Less than two points would - * make no sense. The content of the database is not touched otherwise. - * @return a list of {@link #k k} random points distributed uniformly within - * the attribute ranges of the given database - */ - protected List<V> initialMeans(Relation<V> relation) { - final Random random; - if(this.seed != null) { - random = new Random(this.seed); - } - else { - random = new Random(); - } - if(relation.size() > 0) { - final int dim = DatabaseUtil.dimensionality(relation); - Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<V> means = new ArrayList<V>(k); - if(logger.isVerbose()) { - logger.verbose("initializing random vectors"); - } - for(int i = 0; i < k; i++) { - double[] r = MathUtil.randomDoubleArray(dim, random); - // Rescale - for (int d = 0; d < dim; d++) { - r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; - } - // Instantiate - V randomVector = minmax.first.newInstance(r); - means.add(randomVector); - } - return means; - } - else { - return new ArrayList<V>(0); - } - } - - /** * Get the probabilities for a given point. * * @param index Point ID @@ -429,7 +384,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri protected double delta; - protected Long seed; + protected KMeansInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { @@ -439,20 +394,20 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri k = kP.getValue(); } + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + DoubleParameter deltaP = new DoubleParameter(DELTA_ID, new GreaterEqualConstraint(0.0), 0.0); if(config.grab(deltaP)) { delta = deltaP.getValue(); } - - LongParameter seedP = new LongParameter(SEED_ID, true); - if(config.grab(seedP)) { - seed = seedP.getValue(); - } } @Override protected EM<V> makeInstance() { - return new EM<V>(k, delta, seed); + return new EM<V>(k, delta, initializer); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java deleted file mode 100644 index 38ea89c2..00000000 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java +++ /dev/null @@ -1,307 +0,0 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2011 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Random; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.data.Cluster; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.math.MathUtil; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; -import de.lmu.ifi.dbs.elki.utilities.documentation.Description; -import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; - -/** - * Provides the k-means algorithm. - * <p> - * Reference: J. MacQueen: Some Methods for Classification and Analysis of - * Multivariate Observations. <br> - * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297. - * </p> - * - * @author Arthur Zimek - * - * @apiviz.has MeanModel - * - * @param <D> a type of {@link Distance} as returned by the used distance - * function - * @param <V> a type of {@link NumberVector} as a suitable datatype for this - * algorithm - */ -@Title("K-Means") -@Description("Finds a partitioning into k clusters.") -@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992") -public class KMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<V, D, Clustering<MeanModel<V>>> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { - /** - * The logger for this class. - */ - private static final Logging logger = Logging.getLogger(KMeans.class); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater or equal to 0, where 0 means no limit. - */ - public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); - - /** - * Parameter to specify the random generator seed. - */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); - - /** - * Holds the value of {@link #K_ID}. - */ - private int k; - - /** - * Holds the value of {@link #MAXITER_ID}. - */ - private int maxiter; - - /** - * Holds the value of {@link #SEED_ID}. - */ - private Long seed; - - /** - * Constructor. - * - * @param distanceFunction distance function - * @param k k parameter - * @param maxiter Maxiter parameter - * @param seed Random generator seed - */ - public KMeans(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, Long seed) { - super(distanceFunction); - this.k = k; - this.maxiter = maxiter; - this.seed = seed; - } - - /** - * Run k-means - * - * @param database Database - * @param relation relation to use - * @return result - * @throws IllegalStateException - */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { - final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); - if(relation.size() > 0) { - final int dim = DatabaseUtil.dimensionality(relation); - Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<V> means = new ArrayList<V>(k); - List<V> oldMeans; - if(logger.isVerbose()) { - logger.verbose("initializing random vectors"); - } - for(int i = 0; i < k; i++) { - double[] r = MathUtil.randomDoubleArray(dim, random); - // Rescale - for (int d = 0; d < dim; d++) { - r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; - } - // Instantiate - V randomVector = minmax.first.newInstance(r); - means.add(randomVector); - } - List<? extends ModifiableDBIDs> clusters; - clusters = sort(means, relation); - boolean changed = true; - int iteration = 1; - while(changed) { - if(logger.isVerbose()) { - logger.verbose("iteration " + iteration); - } - oldMeans = new ArrayList<V>(means); - means = means(clusters, means, relation); - clusters = sort(means, relation); - changed = !means.equals(oldMeans); - iteration++; - - if(maxiter > 0 && iteration > maxiter) { - break; - } - } - Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); - for(int i = 0; i < clusters.size(); i++) { - DBIDs ids = clusters.get(i); - MeanModel<V> model = new MeanModel<V>(means.get(i)); - result.addCluster(new Cluster<MeanModel<V>>(ids, model)); - } - return result; - } - else { - return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); - } - } - - /** - * Returns the mean vectors of the given clusters in the given database. - * - * @param clusters the clusters to compute the means - * @param means the recent means - * @param database the database containing the vectors - * @return the mean vectors of the given clusters in the given database - */ - protected List<V> means(List<? extends ModifiableDBIDs> clusters, List<V> means, Relation<V> database) { - List<V> newMeans = new ArrayList<V>(k); - for(int i = 0; i < k; i++) { - ModifiableDBIDs list = clusters.get(i); - V mean = null; - for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) { - if(mean == null) { - mean = database.get(clusterIter.next()); - } - else { - mean = mean.plus(database.get(clusterIter.next())); - } - } - if(list.size() > 0) { - assert mean != null; - mean = mean.multiplicate(1.0 / list.size()); - } - else { - mean = means.get(i); - } - newMeans.add(mean); - } - return newMeans; - } - - /** - * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of - * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. - * - * @param means a list of k means - * @param database the database to cluster - * @return list of k clusters - */ - protected List<? extends ModifiableDBIDs> sort(List<V> means, Relation<V> database) { - List<ArrayModifiableDBIDs> clusters = new ArrayList<ArrayModifiableDBIDs>(k); - for(int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newArray()); - } - - for(DBID id : database.iterDBIDs()) { - List<D> distances = new ArrayList<D>(k); - V fv = database.get(id); - int minIndex = 0; - for(int d = 0; d < k; d++) { - distances.add(getDistanceFunction().distance(fv, means.get(d))); - if(distances.get(d).compareTo(distances.get(minIndex)) < 0) { - minIndex = d; - } - } - clusters.get(minIndex).add(id); - } - for(ArrayModifiableDBIDs cluster : clusters) { - Collections.sort(cluster); - } - return clusters; - } - - @Override - public TypeInformation[] getInputTypeRestriction() { - return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); - } - - @Override - protected Logging getLogger() { - return logger; - } - - /** - * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { - protected int k; - - protected int maxiter; - - protected Long seed; - - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); - if(config.grab(kP)) { - k = kP.getValue(); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); - if(config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); - } - - LongParameter seedP = new LongParameter(SEED_ID, true); - if(config.grab(seedP)) { - seed = seedP.getValue(); - } - } - - @Override - protected KMeans<V, D> makeInstance() { - return new KMeans<V, D>(distanceFunction, k, maxiter, seed); - } - } -}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java index 24985e24..2244b07b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java index c233963d..d6c5872a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java index f7bd10c7..41e48b89 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java index e1329888..45b12c43 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -133,18 +133,18 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori public Result run(Database database, Relation<O> relation) { DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass(); - WritableRecordStore store = DataStoreUtil.makeRecordStorage(distQuery.getRelation().getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls); + WritableRecordStore store = DataStoreUtil.makeRecordStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls); pi = store.getStorage(0, DBID.class); lambda = store.getStorage(1, distCls); // Temporary storage for m. - WritableDataStore<D> m = DataStoreUtil.makeStorage(distQuery.getRelation().getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); + WritableDataStore<D> m = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); - FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", distQuery.getRelation().size(), logger) : null; + FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", relation.size(), logger) : null; // has to be an array for monotonicity reasons! - ModifiableDBIDs processedIDs = DBIDUtil.newArray(distQuery.getRelation().size()); + ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size()); // apply the algorithm - for(DBID id : distQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { step1(id); step2(id, processedIDs, distQuery, m); step3(id, processedIDs, m); @@ -168,8 +168,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori BasicResult result = null; // Build clusters identified by their target object - int minc = minclusters != null ? minclusters : distQuery.getRelation().size(); - result = extractClusters(distQuery.getRelation().getDBIDs(), pi, lambda, minc); + int minc = minclusters != null ? minclusters : relation.size(); + result = extractClusters(relation.getDBIDs(), pi, lambda, minc); result.addChildResult(new MaterializedRelation<DBID>("SLINK pi", "slink-order", TypeUtil.DBID, pi, processedIDs)); result.addChildResult(new MaterializedRelation<D>("SLINK lambda", "slink-order", new SimpleTypeInformation<D>(distCls), lambda, processedIDs)); @@ -288,7 +288,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori D stopdist = null; // sort by lambda ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); - Collections.sort(order, new CompareByLambda<D>(lambda)); + order.sort(new CompareByLambda<D>(lambda)); int index = ids.size() - minclusters - 1; while(index >= 0) { if(lambda.get(order.get(index)).equals(lambda.get(order.get(index + 1)))) { @@ -458,7 +458,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori // extract a hierarchical clustering ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); // sort by lambda - Collections.sort(order, new CompareByLambda<D>(lambda)); + order.sort(new CompareByLambda<D>(lambda)); D curdist = null; D stopdist = null; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index 3bde2932..7c3a13c9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; import java.util.ArrayList; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; @@ -36,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -200,8 +200,8 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple * @return the shared nearest neighbors of the specified query object in the * given database */ - protected List<DBID> findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) { - List<DBID> neighbors = new LinkedList<DBID>(); + protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) { + ArrayModifiableDBIDs neighbors = DBIDUtil.newArray(); for(DBID id : snnInstance.getRelation().iterDBIDs()) { if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) { neighbors.add(id); @@ -222,7 +222,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple * clustering */ protected void expandCluster(SimilarityQuery<O, IntegerDistance> snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { - List<DBID> seeds = findSNNNeighbors(snnInstance, startObjectID); + ArrayModifiableDBIDs seeds = findSNNNeighbors(snnInstance, startObjectID); // startObject is no core-object if(seeds.size() < minpts) { @@ -247,11 +247,10 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple noise.remove(seed); } } - seeds.remove(0); while(seeds.size() > 0) { - DBID o = seeds.remove(0); - List<DBID> neighborhood = findSNNNeighbors(snnInstance, o); + DBID o = seeds.remove(seeds.size() - 1); + ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o); if(neighborhood.size() >= minpts) { for(DBID p : neighborhood) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index 4a0b391c..b877415e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.datasource.filter.NonNumericFeaturesException; +import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException; import de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -84,7 +84,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * Provides the CASH algorithm, an subspace clustering algorithm based on the - * hough transform. + * Hough transform. + * + * <b>Note:</b> CASH requires explicitly setting the input parser other than default to + * {@link de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser}: + * (in the MiniGui, set option: dbc.parser ParameterizationFunctionLabelParser). + * * <p> * Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek: Robust * clustering in arbitrarily oriented subspaces. <br> @@ -99,7 +104,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; */ // todo elke hierarchy (later) @Title("CASH: Robust clustering in arbitrarily oriented subspaces") -@Description("Subspace clustering algorithm based on the hough transform.") +@Description("Subspace clustering algorithm based on the Hough transform.") @Reference(authors = "E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek", title = "Robust clustering in arbitraily oriented subspaces", booktitle = "Proc. 8th SIAM Int. Conf. on Data Mining (SDM'08), Atlanta, GA, 2008", url = "http://www.siam.org/proceedings/datamining/2008/dm08_69_AchtertBoehmDavidKroegerZimek.pdf") public class CASH extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { /** @@ -349,7 +354,7 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste res.addCluster(c); noiseIDs.removeDBIDs(interval.getIDs()); clusterIDs.addDBIDs(interval.getIDs()); - processedIDs.addAll(interval.getIDs()); + processedIDs.addDBIDs(interval.getIDs()); } // Rebuild heap @@ -372,13 +377,13 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste if(dim == noiseDim) { Cluster<Model> c = new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER); res.addCluster(c); - processedIDs.addAll(noiseIDs); + processedIDs.addDBIDs(noiseIDs); } else if(noiseIDs.size() >= minPts) { LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs); Cluster<Model> c = new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)); res.addCluster(c); - processedIDs.addAll(noiseIDs); + processedIDs.addDBIDs(noiseIDs); } } @@ -521,7 +526,7 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste private ParameterizationFunction project(Matrix basis, ParameterizationFunction f) { // Matrix m = new Matrix(new // double[][]{f.getPointCoordinates()}).times(basis); - Matrix m = f.getRowVector().times(basis); + Matrix m = f.getColumnVector().transposeTimes(basis); ParameterizationFunction f_t = new ParameterizationFunction(m.getColumnPackedCopy()); return f_t; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 8fc30b3d..575bf117 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java index 75633853..af4f677f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -244,7 +244,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste } else { ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs()); - merged.addAll(clus.getIDs().asCollection()); + merged.addDBIDs(clus.getIDs()); noise.setIDs(merged); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java index 93d0cc99..98761962 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java index 92723428..1065682c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java new file mode 100644 index 00000000..41ee1f69 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -0,0 +1,566 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.histograms.FlexiHistogram; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; + +/** + * Linear manifold clustering in high dimensional spaces by stochastic search. + * + * Reference: + * <p> + * Robert Haralick, Rave Harpaz<br /> + * Linear manifold clustering in high dimensional spaces by stochastic search<br/> + * In: Pattern Recognition volume 40, Issue 10 + * </p> + * + * Implementation note: the LMCLUS algorithm seems to lack good stopping + * criterions. We can't entirely reproduce the good results from the original + * publication, in particular not on noisy data. But the questionable parts are + * as in the original publication, associated thesis and published source code. + * The minimum cluster size however can serve as a hidden stopping criterion. + * + * @author Ernst Waas + * @author Erich Schubert + */ +@Reference(authors = "Robert Haralick, Rave Harpaz", title = "Linear manifold clustering in high dimensional spaces by stochastic search", booktitle = "Pattern Recognition volume 40, Issue 10", url = "http://dx.doi.org/10.1016/j.patcog.2007.01.020") +public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(LMCLUS.class); + + /** + * Epsilon + */ + private final static double NOT_FROM_ONE_CLUSTER_PROBABILITY = 0.2; + + /** + * Histogram resolution + */ + private final static int BINS = 50; + + /** + * The current threshold value calculated by the findSeperation Method. + */ + private final double sensitivityThreshold; + + /** + * Maximum cluster dimensionality + */ + private final int maxLMDim; + + /** + * Minimum cluster size + */ + private final int minsize; + + /** + * Number of sampling rounds to find a good split + */ + private final int samplingLevel; + + /** + * Constructor. + * + * @param maxdim Maximum dimensionality + * @param minsize Minimum cluster size + * @param samplingLevel Sampling level + * @param sensitivityThreshold Threshold + */ + public LMCLUS(int maxdim, int minsize, int samplingLevel, double sensitivityThreshold) { + super(); + this.maxLMDim = maxdim; + this.minsize = minsize; + this.samplingLevel = samplingLevel; + this.sensitivityThreshold = sensitivityThreshold; + } + + /** + * The main LMCLUS (Linear manifold clustering algorithm) is processed in this + * method. + * + * <PRE> + * The algorithm samples random linear manifolds and tries to find clusters in it. + * It calculates a distance histogram searches for a threshold and partitions the + * points in two groups the ones in the cluster and everything else. + * Then the best fitting linear manifold is searched and registered as a cluster. + * The process is started over until all points are clustered. + * The last cluster should contain all the outliers. (or the whole data if no clusters have been found.) + * For details see {@link LMCLUS}. + * </PRE> + * + * @param database The database to operate on + * @param relation Relation + * @return Clustering result + * @throws de.lmu.ifi.dbs.elki.utilities.UnableToComplyException + */ + public Clustering<Model> run(Database database, Relation<NumberVector<?, ?>> relation) throws UnableToComplyException { + Clustering<Model> ret = new Clustering<Model>("LMCLUS Clustering", "lmclus-clustering"); + FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), logger) : null; + IndefiniteProgress cprogress = logger.isVerbose() ? new IndefiniteProgress("Clusters found", logger) : null; + ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs()); + + final int maxdim = Math.min(maxLMDim, DatabaseUtil.dimensionality(relation)); + int cnum = 0; + while(unclustered.size() > minsize) { + DBIDs current = unclustered; + int lmDim = 1; + for(int k = 1; k <= maxdim; k++) { + // Implementation note: this while loop is from the original publication + // and the published LMCLUS source code. It doesn't make sense to me - + // it is lacking a stop criterion other than "cluster is too small" and + // "cluster is inseparable"! Additionally, there is good criterion for + // stopping at the appropriate dimensionality either. + while(true) { + Separation separation = findSeparation(relation, current, k); + // logger.verbose("k: " + k + " goodness: " + separation.goodness + + // " threshold: " + separation.threshold); + if(separation.goodness <= sensitivityThreshold) { + break; + } + ModifiableDBIDs subset = DBIDUtil.newArray(current.size()); + for(DBID id : current) { + if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { + subset.add(id); + } + } + // logger.verbose("size:"+subset.size()); + if(subset.size() < minsize) { + break; + } + current = subset; + lmDim = k; + // System.out.println("Partition: " + subset.size()); + } + } + // No more clusters found + if(current.size() < minsize || current == unclustered) { + break; + } + // New cluster found + // TODO: annotate cluster with dimensionality + final Cluster<Model> cluster = new Cluster<Model>(current); + cluster.setName("Cluster_" + lmDim + "d_" + cnum); + cnum++; + ret.addCluster(cluster); + // Remove from main working set. + unclustered.removeDBIDs(current); + if(progress != null) { + progress.setProcessed(relation.size() - unclustered.size(), logger); + } + if(cprogress != null) { + cprogress.setProcessed(cnum, logger); + } + } + // Remaining objects are noise + if(unclustered.size() > 0) { + ret.addCluster(new Cluster<Model>(unclustered, true)); + } + if(progress != null) { + progress.setProcessed(relation.size(), logger); + progress.ensureCompleted(logger); + } + if(cprogress != null) { + cprogress.setCompleted(logger); + } + return ret; + } + + /** + * Deviation from a manifold described by beta. + * + * @param delta Delta from origin vector + * @param beta Manifold + * @return Deviation score + */ + private double deviation(Vector delta, Matrix beta) { + double a = delta.euclideanLength(); + double b = beta.transposeTimes(delta).euclideanLength(); + return Math.sqrt((a * a) - (b * b)); + } + + /** + * This method samples a number of linear manifolds an tries to determine + * which the one with the best cluster is. + * + * <PRE> + * A number of sample points according to the dimension of the linear manifold are taken. + * The basis (B) and the origin(o) of the manifold are calculated. + * A distance histogram using the distance function ||x-o|| -||B^t*(x-o)|| is generated. + * The best threshold is searched using the elevate threshold function. + * The overall goodness of the threshold is determined. + * The process is redone until a specific number of samples is taken. + * </PRE> + * + * @param relation The vector relation + * @param currentids Current DBIDs + * @param dimension the dimension of the linear manifold to sample. + * @return the overall goodness of the separation. The values origin basis and + * threshold are returned indirectly over class variables. + */ + private Separation findSeparation(Relation<NumberVector<?, ?>> relation, DBIDs currentids, int dimension) { + Separation separation = new Separation(); + // determine the number of samples needed, to secure that with a specific + // probability + // in at least on sample every sampled point is from the same cluster. + int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size()); + // System.out.println("Number of samples: " + samples); + Random r = new Random(); + int remaining_retries = 100; + for(int i = 1; i <= samples; i++) { + DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong()); + final Iterator<DBID> iter = sample.iterator(); + // Use first as origin + DBID origin = iter.next(); + Vector originV = relation.get(origin).getColumnVector(); + // Build orthogonal basis from remainder + Matrix basis; + { + List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1); + while(iter.hasNext()) { + Vector vec = relation.get(iter.next()).getColumnVector(); + vectors.add(vec.minusEquals(originV)); + } + // generate orthogonal basis + basis = generateOrthonormalBasis(vectors); + if(basis == null) { + // new sample has to be taken. + i--; + remaining_retries--; + if(remaining_retries < 0) { + throw new AbortException("Too many retries in sampling, and always a linear dependant data set."); + } + continue; + } + } + // Generate and fill a histogram. + FlexiHistogram<Double, Double> histogram = FlexiHistogram.DoubleSumHistogram(BINS); + double w = 1.0 / currentids.size(); + for(DBID point : currentids) { + // Skip sampled points + if(sample.contains(point)) { + continue; + } + Vector vec = relation.get(point).getColumnVector().minusEquals(originV); + final double distance = deviation(vec, basis); + histogram.aggregate(distance, w); + } + double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold + if(th[1] > separation.goodness) { + separation.goodness = th[1]; + separation.threshold = th[0]; + separation.originV = originV; + separation.basis = basis; + } + } + return separation; + } + + /** + * This Method generates an orthonormal basis from a set of Vectors. It uses + * the established Gram-Schmidt algorithm for orthonormalisation: + * + * <PRE> + * u_1 = v_1 + * u_k = v_k -proj_u1(v_k)...proj_u(k-1)(v_k); + * + * Where proj_u(v) = <v,u>/<u,u> *u + * </PRE> + * + * @param vectors The set of vectors to generate the orthonormal basis from + * @return the orthonormal basis generated by this method. + * @throws RuntimeException if the given vectors are not linear independent. + */ + private Matrix generateOrthonormalBasis(List<Vector> vectors) { + Vector first = vectors.get(0); + first = first.times(1.0 / first.euclideanLength()); + Matrix ret = new Matrix(first.getDimensionality(), vectors.size()); + ret.setCol(0, first); + for(int i = 1; i < vectors.size(); i++) { + // System.out.println("Matrix:" + ret); + Vector v_i = vectors.get(i); + Vector u_i = v_i.copy(); + // System.out.println("Vector " + i + ":" + partialSol); + for(int j = 0; j < i; j++) { + Vector v_j = ret.getCol(j); + double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j); + if(Double.isNaN(f)) { + if(logger.isDebuggingFine()) { + logger.debugFine("Zero vector encountered? " + v_j); + } + return null; + } + u_i.minusTimesEquals(v_j, f); + } + // check if the vectors weren't independent + final double len_u_i = u_i.euclideanLength(); + if(len_u_i == 0.0) { + if(logger.isDebuggingFine()) { + logger.debugFine("Points not independent - no orthonormalization."); + } + return null; + } + // System.out.println("Vector " + i + ":" + partialSol); + u_i.timesEquals(1 / len_u_i); + ret.setCol(i, u_i); + } + return ret; + } + + /** + * Evaluate the histogram to find a suitable threshold + * + * @param histogram Histogram to evaluate + * @return Position and goodness + */ + private double[] findAndEvaluateThreshold(FlexiHistogram<Double, Double> histogram) { + int n = histogram.getNumBins(); + double[] p1 = new double[n]; + double[] p2 = new double[n]; + double[] mu1 = new double[n]; + double[] mu2 = new double[n]; + double[] sigma1 = new double[n]; + double[] sigma2 = new double[n]; + double[] jt = new double[n]; + // Forward pass + { + MeanVariance mv = new MeanVariance(); + Iterator<DoubleObjPair<Double>> forward = histogram.iterator(); + for(int i = 0; forward.hasNext(); i++) { + DoubleObjPair<Double> pair = forward.next(); + p1[i] = pair.second + ((i > 0) ? p1[i - 1] : 0); + mv.put(i, pair.second); + mu1[i] = mv.getMean(); + sigma1[i] = mv.getNaiveStddev(); + } + } + // Backwards pass + { + MeanVariance mv = new MeanVariance(); + Iterator<DoubleObjPair<Double>> backwards = histogram.reverseIterator(); + for(int j = n - 1; backwards.hasNext(); j--) { + DoubleObjPair<Double> pair = backwards.next(); + p2[j] = pair.second + ((j + 1 < n) ? p2[j + 1] : 0); + mv.put(j, pair.second); + mu2[j] = mv.getMean(); + sigma2[j] = mv.getNaiveStddev(); + } + } + + for(int i = 0; i < n; i++) { + jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i]))); + } + + int bestpos = -1; + double bestgoodness = Double.NEGATIVE_INFINITY; + + double devPrev = jt[1] - jt[0]; + for(int i = 1; i < jt.length - 1; i++) { + double devCur = jt[i + 1] - jt[i]; + // System.out.println(p1[i]); + // System.out.println(jt[i + 1]); + // System.out.println(jt[i]); + // System.out.println(devCur); + // Local minimum found - calculate depth + if(devCur >= 0 && devPrev <= 0) { + double lowestMaxima = Double.POSITIVE_INFINITY; + for(int j = i - 1; j > 0; j--) { + if(jt[j - 1] < jt[j]) { + lowestMaxima = Math.min(lowestMaxima, jt[j]); + break; + } + } + for(int j = i + 1; j < n - 2; j++) { + if(jt[j + 1] < jt[j]) { + lowestMaxima = Math.min(lowestMaxima, jt[j]); + break; + } + } + double localDepth = lowestMaxima - jt[i]; + + final double mud = mu1[i] - mu2[i]; + double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]); + if(Double.isNaN(discriminability)) { + discriminability = -1; + } + double goodness = localDepth * discriminability; + if(goodness > bestgoodness) { + bestgoodness = goodness; + bestpos = i; + } + } + devPrev = devCur; + } + return new double[] { histogram.getBinMax(bestpos), bestgoodness }; + } + + @Override + protected Logging getLogger() { + return logger; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + /** + * Class to represent a linear manifold separation + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private static class Separation { + /** + * Goodness of separation + */ + double goodness = Double.NEGATIVE_INFINITY; + + /** + * Threshold + */ + double threshold = Double.NEGATIVE_INFINITY; + + /** + * Basis of manifold + */ + Matrix basis = null; + + /** + * Origin vector + */ + Vector originV = null; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter with the maximum dimension to search for + */ + public static final OptionID MAXDIM_ID = OptionID.getOrCreateOptionID("lmclus.maxdim", "Maximum linear manifold dimension to search."); + + /** + * Parameter for the minimum cluster size + */ + public static final OptionID MINSIZE_ID = OptionID.getOrCreateOptionID("lmclus.minsize", "Minimum cluster size to allow."); + + /** + * Sampling intensity level + */ + public static final OptionID SAMPLINGL_ID = OptionID.getOrCreateOptionID("lmclus.sampling-level", "A number used to determine how many samples are taken in each search."); + + /** + * Global significance threshold + */ + public static final OptionID THRESHOLD_ID = OptionID.getOrCreateOptionID("lmclus.threshold", "Threshold to determine if a cluster was found."); + + /** + * Maximum dimensionality to search for + */ + private int maxdim = Integer.MAX_VALUE; + + /** + * Minimum cluster size. + */ + private int minsize; + + /** + * Sampling level + */ + private int samplingLevel; + + /** + * Threshold + */ + private double threshold; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter maxLMDimP = new IntParameter(MAXDIM_ID, new GreaterEqualConstraint(1), true); + if(config.grab(maxLMDimP)) { + maxdim = maxLMDimP.getValue(); + } + IntParameter minsizeP = new IntParameter(MINSIZE_ID, new GreaterEqualConstraint(1)); + if(config.grab(minsizeP)) { + minsize = minsizeP.getValue(); + } + IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100); + if(config.grab(samplingLevelP)) { + samplingLevel = samplingLevelP.getValue(); + } + DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID); + if(config.grab(sensivityThresholdP)) { + threshold = sensivityThresholdP.getValue(); + } + } + + @Override + protected LMCLUS makeInstance() { + return new LMCLUS(maxdim, minsize, samplingLevel, threshold); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index 924e1786..eb5608fc 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -50,6 +50,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.SortedEigenPairs; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAResult; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; @@ -420,14 +421,14 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust /** * Returns the union of the two specified clusters. * - * @param database the database holding the objects + * @param relation the database holding the objects * @param distFunc the distance function * @param c1 the first cluster * @param c2 the second cluster * @param dim the dimensionality of the union cluster * @return the union of the two specified clusters */ - private ORCLUSCluster union(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster c1, ORCLUSCluster c2, int dim) { + private ORCLUSCluster union(Relation<V> relation, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster c1, ORCLUSCluster c2, int dim) { ORCLUSCluster c = new ORCLUSCluster(); c.objectIDs = DBIDUtil.newHashSet(c1.objectIDs); @@ -436,11 +437,13 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust c.objectIDs = DBIDUtil.newArray(c.objectIDs); if(c.objectIDs.size() > 0) { - c.centroid = DatabaseUtil.centroid(database, c.objectIDs); - c.basis = findBasis(database, distFunc, c, dim); + c.centroid = DatabaseUtil.centroid(relation, c.objectIDs); + c.basis = findBasis(relation, distFunc, c, dim); } else { - c.centroid = c1.centroid.plus(c2.centroid).multiplicate(0.5); + V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5); + c.centroid = factory.newNumberVector(cent.getArrayRef()); double[][] doubles = new double[c1.basis.getRowDimensionality()][dim]; for(int i = 0; i < dim; i++) { doubles[i][i] = 1; @@ -460,9 +463,9 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust * @return the projection of double vector o in the subspace of cluster c */ private V projection(ORCLUSCluster c, V o, V factory) { - Matrix o_proj = o.getRowVector().times(c.basis); + Matrix o_proj = o.getColumnVector().transposeTimes(c.basis); double[] values = o_proj.getColumnPackedCopy(); - return factory.newInstance(values); + return factory.newNumberVector(values); } @Override @@ -523,7 +526,7 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust for(int d = 1; d <= o.getDimensionality(); d++) { values[d - 1] = o.doubleValue(d); } - this.centroid = factory.newInstance(values); + this.centroid = factory.newNumberVector(values); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java index 6c5db740..46112498 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java index 62ff658f..86e045cb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java index 8cd156e8..8b6d104c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java index 82a1f1e1..665de632 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java new file mode 100644 index 00000000..d3c73b53 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -0,0 +1,310 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Abstract base class for k-means implementations. + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> { + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); + + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater or equal to 0, where 0 means no limit. + */ + public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); + + /** + * Parameter to specify the random generator seed. + */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); + + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + + /** + * Holds the value of {@link #K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link #MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMeansInitialization<V> initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param relation the database to cluster + * @param means a list of k means + * @param clusters cluster assignment + * @return true when the object was reassigned + */ + protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) { + boolean changed = false; + + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction(); + for(DBID id : relation.iterDBIDs()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.doubleDistance(fv, means.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + if(clusters.get(minIndex).add(id)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(id)) { + break; + } + } + } + } + } + } + else { + final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); + for(DBID id : relation.iterDBIDs()) { + D mindist = df.getDistanceFactory().infiniteDistance(); + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + D dist = df.distance(fv, means.get(i)); + if(dist.compareTo(mindist) < 0) { + minIndex = i; + mindist = dist; + } + } + if(clusters.get(minIndex).add(id)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(id)) { + break; + } + } + } + } + } + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + /** + * Returns the mean vectors of the given clusters in the given database. + * + * @param clusters the clusters to compute the means + * @param means the recent means + * @param database the database containing the vectors + * @return the mean vectors of the given clusters in the given database + */ + protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) { + List<Vector> newMeans = new ArrayList<Vector>(k); + for(int i = 0; i < k; i++) { + ModifiableDBIDs list = clusters.get(i); + Vector mean = null; + for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) { + if(mean == null) { + mean = database.get(clusterIter.next()).getColumnVector(); + } + else { + mean.plusEquals(database.get(clusterIter.next()).getColumnVector()); + } + } + if(list.size() > 0) { + assert mean != null; + mean.timesEquals(1.0 / list.size()); + } + else { + mean = means.get(i); + } + newMeans.add(mean); + } + return newMeans; + } + + /** + * Compute an incremental update for the mean + * + * @param mean Mean to update + * @param vec Object vector + * @param newsize (New) size of cluster + * @param op Cluster size change / Weight change + */ + protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) { + if(newsize == 0) { + return; // Keep old mean + } + Vector delta = vec.getColumnVector(); + // Compute difference from mean + delta.minusEquals(mean); + delta.timesEquals(op / newsize); + mean.plusEquals(delta); + } + + /** + * Perform a MacQueen style iteration. + * + * @param relation Relation + * @param means Means + * @param clusters Clusters + * @return true when the means have changed + */ + protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { + boolean changed = false; + + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + // Raw distance function + @SuppressWarnings("unchecked") + final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction(); + + // Incremental update + for(DBID id : relation.iterDBIDs()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + double dist = df.doubleDistance(fv, means.get(i)); + if(dist < mindist) { + minIndex = i; + mindist = dist; + } + } + // Update the cluster mean incrementally: + for(int i = 0; i < k; i++) { + ModifiableDBIDs ci = clusters.get(i); + if(i == minIndex) { + if(ci.add(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + changed = true; + } + } + else if(ci.remove(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + changed = true; + } + } + } + } + else { + // Raw distance function + final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); + + // Incremental update + for(DBID id : relation.iterDBIDs()) { + D mindist = df.getDistanceFactory().infiniteDistance(); + V fv = relation.get(id); + int minIndex = 0; + for(int i = 0; i < k; i++) { + D dist = df.distance(fv, means.get(i)); + if(dist.compareTo(mindist) < 0) { + minIndex = i; + mindist = dist; + } + } + // Update the cluster mean incrementally: + for(int i = 0; i < k; i++) { + ModifiableDBIDs ci = clusters.get(i); + if(i == minIndex) { + if(ci.add(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + changed = true; + } + } + else if(ci.remove(id)) { + incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + changed = true; + } + } + } + } + return changed; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java new file mode 100644 index 00000000..b5f088fb --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -0,0 +1,71 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; + +/** + * Abstract base class for common k-means initializations. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> { + /** + * Holds the value of {@link KMeansLloyd#SEED_ID}. + */ + protected Long seed; + + /** + * Constructor. + * + * @param seed Random seed. + */ + public AbstractKMeansInitialization(Long seed) { + this.seed = seed; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + protected Long seed; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true); + if(config.grab(seedP)) { + seed = seedP.getValue(); + } + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java new file mode 100644 index 00000000..78ccd426 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -0,0 +1,74 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Initialize K-means by using the first k objects as initial means. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { + /** + * Constructor. + */ + public FirstKInitialMeans() { + super(null); + } + + @Override + public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + Iterator<DBID> iter = relation.iterDBIDs(); + List<Vector> means = new ArrayList<Vector>(k); + for(int i = 0; i < k && iter.hasNext(); i++) { + means.add(relation.get(iter.next()).getColumnVector()); + } + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + @Override + protected FirstKInitialMeans<V> makeInstance() { + return new FirstKInitialMeans<V>(); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java new file mode 100644 index 00000000..f4c0d9c7 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -0,0 +1,49 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Interface for initializing K-Means + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public interface KMeansInitialization<V extends NumberVector<V, ?>> { + /** + * Choose initial means + * + * @param relation Relation + * @param k Parameter k + * @param distanceFunction Distance function + * @return List of chosen means for k-means + */ + public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java new file mode 100644 index 00000000..fda1d6c0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -0,0 +1,176 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-means algorithm, using Lloyd-style bulk iterations. + * + * <p> + * Reference:<br /> + * S. Lloyd<br/> + * Least squares quantization in PCM<br/> + * IEEE Transactions on Information Theory 28 (2)<br/> + * previously published as Bell Telephone Laboratories Paper + * </p> + * + * @author Arthur Zimek + * + * @apiviz.has MeanModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +@Title("K-Means") +@Description("Finds a partitioning into k clusters.") +@Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489") +public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMeansLloyd.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-means + * + * @param database Database + * @param relation relation to use + * @return result + * @throws IllegalStateException + */ + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + if(relation.size() <= 0) { + return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if(logger.isVerbose()) { + logger.verbose("K-Means iteration " + (iteration + 1)); + } + boolean changed = assignToNearestCluster(relation, means, clusters); + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute means. + means = means(clusters, means, relation); + } + // Wrap result + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans<V, D> makeInstance() { + return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java new file mode 100644 index 00000000..56492dd0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -0,0 +1,177 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-means algorithm, using MacQueen style incremental updates. + * + * <p> + * Reference:<br /> + * J. MacQueen: Some Methods for Classification and Analysis of Multivariate + * Observations. <br /> + * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297. + * </p> + * + * @author Erich Schubert + * + * @apiviz.has MeanModel + * + * @param <V> vector type to use + * @param <D> distance function value type + */ +@Title("K-Means") +@Description("Finds a partitioning into k clusters.") +@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992") +public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMeansMacQueen.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-means + * + * @param database Database + * @param relation relation to use + * @return result + * @throws IllegalStateException + */ + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + if(relation.size() <= 0) { + return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Initialize cluster and assign objects + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + assignToNearestCluster(relation, means, clusters); + // Initial recomputation of the means. + means = means(clusters, means, relation); + + // Refine result + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if(logger.isVerbose()) { + logger.verbose("K-Means iteration " + (iteration + 1)); + } + boolean changed = macQueenIterate(relation, means, clusters); + if(!changed) { + break; + } + } + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); + for(int i = 0; i < clusters.size(); i++) { + DBIDs ids = clusters.get(i); + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addCluster(new Cluster<MeanModel<V>>(ids, model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans<V, D> makeInstance() { + return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java new file mode 100644 index 00000000..c7a2fa1d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -0,0 +1,213 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.LoggingUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; + +/** + * K-Means++ initialization for k-means. + * + * Reference: + * <p> + * D. Arthur, S. Vassilvitskii<br /> + * k-means++: the advantages of careful seeding<br /> + * In: Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, + * SODA 2007 + * </p> + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494") +public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> { + /** + * Constructor. + * + * @param seed Random seed. + */ + public KMeansPlusPlusInitialMeans(Long seed) { + super(seed); + } + + @Override + public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + // Get a distance query + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("K-Means++ initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; + DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + + // Chose first mean + List<Vector> means = new ArrayList<Vector>(k); + + Random random = (seed != null) ? new Random(seed) : new Random(); + DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next(); + means.add(relation.get(first).getColumnVector()); + + ModifiableDBIDs chosen = DBIDUtil.newHashSet(k); + chosen.add(first); + ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); + // Initialize weights + double[] weights = new double[ids.size()]; + double weightsum = initialWeights(weights, ids, first, distQ); + while(means.size() < k) { + if(weightsum > Double.MAX_VALUE) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?"); + } + if(weightsum < Double.MIN_NORMAL) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); + } + double r = random.nextDouble() * weightsum; + int pos = 0; + while(r > 0 && pos < weights.length) { + r -= weights[pos]; + pos++; + } + // Add new mean: + DBID newmean = ids.get(pos); + means.add(relation.get(newmean).getColumnVector()); + chosen.add(newmean); + // Update weights: + weights[pos] = 0.0; + // Choose optimized version for double distances, if applicable. + if (distF instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF; + weightsum = updateWeights(weights, ids, newmean, ddist, relation); + } else { + weightsum = updateWeights(weights, ids, newmean, distQ); + } + } + + return means; + } + + /** + * Initialize the weight list. + * + * @param weights Weight list + * @param ids IDs + * @param latest Added ID + * @param distQ Distance query + * @return Weight sum + */ + protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + double weightsum = 0.0; + DBIDIter it = ids.iter(); + for(int i = 0; i < weights.length; i++, it.advance()) { + DBID id = it.getDBID(); + if(latest.equals(id)) { + weights[i] = 0.0; + } + else { + double d = distQ.distance(latest, id).doubleValue(); + weights[i] = d * d; + } + weightsum += weights[i]; + } + return weightsum; + } + + /** + * Update the weight list. + * + * @param weights Weight list + * @param ids IDs + * @param latest Added ID + * @param distQ Distance query + * @return Weight sum + */ + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + double weightsum = 0.0; + DBIDIter it = ids.iter(); + for(int i = 0; i < weights.length; i++, it.advance()) { + DBID id = it.getDBID(); + if(weights[i] > 0.0) { + double d = distQ.distance(latest, id).doubleValue(); + weights[i] = Math.min(weights[i], d * d); + weightsum += weights[i]; + } + } + return weightsum; + } + + /** + * Update the weight list. + * + * @param weights Weight list + * @param ids IDs + * @param latest Added ID + * @param distF Distance function + * @return Weight sum + */ + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, PrimitiveDoubleDistanceFunction<V> distF, Relation<V> rel) { + final V lv = rel.get(latest); + double weightsum = 0.0; + DBIDIter it = ids.iter(); + for(int i = 0; i < weights.length; i++, it.advance()) { + DBID id = it.getDBID(); + if(weights[i] > 0.0) { + double d = distF.doubleDistance(lv, rel.get(id)); + weights[i] = Math.min(weights[i], d * d); + weightsum += weights[i]; + } + } + return weightsum; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + @Override + protected KMeansPlusPlusInitialMeans<V, D> makeInstance() { + return new KMeansPlusPlusInitialMeans<V, D>(seed); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java new file mode 100644 index 00000000..30e59453 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -0,0 +1,78 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Initialize K-means by randomly choosing k exsiting elements as cluster + * centers. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { + /** + * Constructor. + * + * @param seed Random seed. + */ + public RandomlyChosenInitialMeans(Long seed) { + super(seed); + } + + @Override + public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed); + List<Vector> means = new ArrayList<Vector>(k); + for(DBID id : ids) { + means.add(relation.get(id).getColumnVector()); + } + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + + @Override + protected RandomlyChosenInitialMeans<V> makeInstance() { + return new RandomlyChosenInitialMeans<V>(seed); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java new file mode 100644 index 00000000..e8a466dd --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -0,0 +1,87 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Initialize k-means by generating random vectors (within the data sets value + * range). + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { + /** + * Constructor. + * + * @param seed Random seed. + */ + public RandomlyGeneratedInitialMeans(Long seed) { + super(seed); + } + + @Override + public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + final int dim = DatabaseUtil.dimensionality(relation); + Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); + List<Vector> means = new ArrayList<Vector>(k); + final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); + for(int i = 0; i < k; i++) { + double[] r = MathUtil.randomDoubleArray(dim, random); + // Rescale + for(int d = 0; d < dim; d++) { + r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; + } + means.add(new Vector(r)); + } + return means; + } + + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + + @Override + protected RandomlyGeneratedInitialMeans<V> makeInstance() { + return new RandomlyGeneratedInitialMeans<V>(seed); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java new file mode 100644 index 00000000..2ce625b0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java @@ -0,0 +1,26 @@ +/** + * <p>K-means clustering and variations.</p> + */ +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2012 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java index 660a7a4f..eed031df 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java @@ -1,5 +1,5 @@ /** - * <p>Clustering algorithms</p> + * <p>Clustering algorithms.</p> * * Clustering algorithms are supposed to implement the {@link de.lmu.ifi.dbs.elki.algorithm.Algorithm}-Interface. * The more specialized interface {@link de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm} @@ -15,7 +15,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index dfc4e1cd..e3b274a6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index 987c7eda..c4c1687b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 36473cc0..40ab60a8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 92c2248c..3f16e907 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java index 22e9c150..4ca5a564 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index c0edb2ea..963c0922 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -44,7 +44,7 @@ import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.AbstractDimensionsSelectingDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionsSelectingEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; @@ -94,7 +94,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus /** * The distance function to determine the distance between database objects. * <p> - * Default value: {@link DimensionsSelectingEuclideanDistanceFunction} + * Default value: {@link SubspaceEuclideanDistanceFunction} * </p> * <p> * Key: {@code -subclu.distancefunction} @@ -477,7 +477,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, DimensionsSelectingEuclideanDistanceFunction.class); + ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); if(config.grab(param)) { distance = param.instantiateClass(config); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java index 1874c9e8..eff71a35 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java index 4b6fa9ad..db687567 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; /** @@ -265,7 +266,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { resultIntervals.add(this.intervals.last()); resultIntervals.add(other.intervals.last()); - ModifiableDBIDs resultIDs = DBIDUtil.newHashSet(this.ids); + HashSetModifiableDBIDs resultIDs = DBIDUtil.newHashSet(this.ids); resultIDs.retainAll(other.ids); if(resultIDs.size() / all >= tau) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java index 444cb0e6..7a686190 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java index 168ceadb..2a1eb930 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java @@ -10,7 +10,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java index 02350db3..43c6a218 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -85,9 +85,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl public static final OptionID MULTIPLE_ID = OptionID.getOrCreateOptionID("bylabelclustering.multiple", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned."); /** - * Flag to indicate that multiple cluster assignment is possible. If an - * assignment to multiple clusters is desired, the labels indicating the - * clusters need to be separated by blanks. + * Pattern to recognize noise clusters by. */ public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("bylabelclustering.noise", "Pattern to recognize noise classes by their label."); @@ -144,7 +142,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl ModifiableDBIDs noiseids = DBIDUtil.newArray(); Clustering<Model> result = new Clustering<Model>("By Label Clustering", "bylabel-clustering"); for(Entry<String, ModifiableDBIDs> entry : labelMap.entrySet()) { - ModifiableDBIDs ids = labelMap.get(entry.getKey()); + ModifiableDBIDs ids = entry.getValue(); if(ids.size() <= 1) { noiseids.addDBIDs(ids); continue; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java index 5b8041d7..228cc7e7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java new file mode 100644 index 00000000..cd45cda2 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java @@ -0,0 +1,163 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Pseudo clustering using annotated models. + * + * This "algorithm" puts elements into the same cluster when they agree in their + * model. I.e. it just uses a predefined clustering, and is mostly useful for + * testing and evaluation (e.g. comparing the result of a real algorithm to the + * reference result / golden standard used by the generator). + * + * @author Erich Schubert + * + * @apiviz.uses Model + */ +@Title("Clustering by model") +@Description("Cluster points by a (pre-assigned!) model. For comparing results with a reference clustering.") +public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(ByModelClustering.class); + + /** + * Pattern to recognize noise clusters with + */ + public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("bymodel.noise", "Pattern to recognize noise models by their label."); + + /** + * Holds the value of {@link #NOISE_ID}. + */ + private Pattern noisepattern = null; + + /** + * Constructor. + * + * @param noisepattern Noise pattern + */ + public ByModelClustering(Pattern noisepattern) { + super(); + this.noisepattern = noisepattern; + } + + /** + * Constructor without parameters + */ + public ByModelClustering() { + this(null); + } + + /** + * Run the actual clustering algorithm. + * + * @param relation The data input we use + */ + public Clustering<Model> run(Relation<Model> relation) { + // Build model mapping + HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<Model, ModifiableDBIDs>(); + for(DBID id : relation.iterDBIDs()) { + Model model = relation.get(id); + ModifiableDBIDs modelids = modelMap.get(model); + if(modelids == null) { + modelids = DBIDUtil.newHashSet(); + modelMap.put(model, modelids); + } + modelids.add(id); + } + + Clustering<Model> result = new Clustering<Model>("By Model Clustering", "bymodel-clustering"); + for(Entry<Model, ModifiableDBIDs> entry : modelMap.entrySet()) { + final Model model = entry.getKey(); + final ModifiableDBIDs ids = entry.getValue(); + final String name = (model instanceof GeneratorInterface) ? ((GeneratorInterface) model).getName() : model.toString(); + Cluster<Model> c = new Cluster<Model>(name, ids, model); + if(noisepattern != null && noisepattern.matcher(name).find()) { + c.setNoise(true); + } + result.addCluster(c); + } + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.MODEL); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + protected Pattern noisepat; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter noisepatP = new PatternParameter(NOISE_ID, true); + if(config.grab(noisepatP)) { + noisepat = noisepatP.getValue(); + } + } + + @Override + protected ByModelClustering makeInstance() { + return new ByModelClustering(noisepat); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java index a316ce57..2e7d006d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java index b85f5445..c497632c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java index 5629855c..5870a736 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |