diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/em')
11 files changed, 1530 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java new file mode 100644 index 00000000..605c3aed --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java @@ -0,0 +1,89 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.RandomlyGeneratedInitialMeans; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Abstract base class for initializing EM. + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <M> Model type + */ +public abstract class AbstractEMModelFactory<V extends NumberVector, M extends MeanModel> implements EMClusterModelFactory<V, M> { + /** + * Class to choose the initial means + */ + protected KMeansInitialization<V> initializer; + + /** + * Constructor. + * + * @param initializer Class for choosing the initial seeds. + */ + public AbstractEMModelFactory(KMeansInitialization<V> initializer) { + super(); + this.initializer = initializer; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> vector type + */ + public abstract static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * Parameter to specify the cluster center initialization. + */ + public static final OptionID INIT_ID = new OptionID("em.centers", // + "Method to choose the initial cluster centers."); + + /** + * Initialization method + */ + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java new file mode 100644 index 00000000..4ba9290f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java @@ -0,0 +1,200 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.Arrays; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Simpler model for a single Gaussian cluster, without covariances. + * + * @author Erich Schubert + */ +public class DiagonalGaussianModel implements EMClusterModel<EMModel> { + /** + * Class logger. + */ + private static Logging LOG = Logging.getLogger(DiagonalGaussianModel.class); + + /** + * Mean vector. + */ + Vector mean; + + /** + * Per-dimension variances. + */ + double[] variances; + + /** + * Temporary storage, to avoid reallocations. + */ + double[] nmea, mref; + + /** + * Normalization factor. + */ + double norm, normDistrFactor; + + /** + * Weight aggregation sum + */ + double weight, wsum; + + /** + * Constructor. + * + * @param weight Cluster weight + * @param mean Initial mean + */ + public DiagonalGaussianModel(double weight, Vector mean) { + this(weight, mean, MathUtil.powi(MathUtil.TWOPI, mean.getDimensionality())); + } + + /** + * Constructor. + * + * @param weight Cluster weight + * @param mean Initial mean + * @param norm Normalization factor. + */ + public DiagonalGaussianModel(double weight, Vector mean, double norm) { + this.weight = weight; + final int dim = mean.getDimensionality(); + this.mean = mean; + this.norm = norm; + this.normDistrFactor = 1. / Math.sqrt(norm); // assume det=1 + this.mref = mean.getArrayRef(); + this.nmea = new double[dim]; + this.variances = new double[dim]; + Arrays.fill(variances, 1.); + this.wsum = 0.; + } + + @Override + public void beginEStep() { + wsum = 0.; + } + + @Override + public void updateE(NumberVector vec, double wei) { + assert (vec.getDimensionality() == mref.length); + final double nwsum = wsum + wei; + // Compute new means + for(int i = 0; i < mref.length; i++) { + final double delta = vec.doubleValue(i) - mref[i]; + final double rval = delta * wei / nwsum; + nmea[i] = mref[i] + rval; + } + // Update variances + for(int i = 0; i < mref.length; i++) { + // We DO want to use the new mean once and the old mean once! + // It does not matter which one is which. + variances[i] += (vec.doubleValue(i) - nmea[i]) * (vec.doubleValue(i) - mref[i]) * wei; + } + // Use new values. + wsum = nwsum; + System.arraycopy(nmea, 0, mref, 0, nmea.length); + } + + @Override + public void finalizeEStep() { + if(wsum > 0.) { + final double s = 1. / wsum; + double det = 1.; + for(int i = 0; i < variances.length; i++) { + double v = variances[i]; + v = v > 0 ? v * s : Matrix.SINGULARITY_CHEAT; + variances[i] = v; + det *= v; + } + normDistrFactor = 1. / Math.sqrt(norm * det); + } + else { + // Degenerate + normDistrFactor = 1. / Math.sqrt(norm); + } + } + + /** + * Compute the Mahalanobis distance from the centroid for a given vector. + * + * @param vec Vector + * @return Mahalanobis distance + */ + public double mahalanobisDistance(Vector vec) { + double[] difference = vec.minus(mean).getArrayRef(); + double agg = 0.; + for(int i = 0; i < variances.length; i++) { + agg += difference[i] / variances[i] * difference[i]; + } + return agg; + } + + /** + * Compute the Mahalanobis distance from the centroid for a given vector. + * + * @param vec Vector + * @return Mahalanobis distance + */ + public double mahalanobisDistance(NumberVector vec) { + double[] difference = vec.getColumnVector().minusEquals(mean).getArrayRef(); + double agg = 0.; + for(int i = 0; i < variances.length; i++) { + agg += difference[i] / variances[i] * difference[i]; + } + return agg; + } + + @Override + public double estimateDensity(NumberVector vec) { + double power = mahalanobisDistance(vec) * .5; + double prob = normDistrFactor * Math.exp(-power); + if(!(prob >= 0.)) { + LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor); + prob = 0.; + } + return prob * weight; + } + + @Override + public double getWeight() { + return weight; + } + + @Override + public void setWeight(double weight) { + this.weight = weight; + } + + @Override + public EMModel finalizeCluster() { + return new EMModel(mean, Matrix.diagonal(variances)); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java new file mode 100644 index 00000000..3ac02dae --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java @@ -0,0 +1,88 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Factory for EM with multivariate gaussian models using diagonal matrixes. + * + * These models have individual variances, but no covariance, so this + * corresponds to the {@code 'VVI'} model in Mclust (R). + * + * @author Erich Schubert + * + * @apiviz.has DiagonalGaussianModel + * + * @param <V> vector type + */ +public class DiagonalGaussianModelFactory<V extends NumberVector> extends AbstractEMModelFactory<V, EMModel> { + /** + * Constructor. + * + * @param initializer Class for choosing the inital seeds. + */ + public DiagonalGaussianModelFactory(KMeansInitialization<V> initializer) { + super(initializer); + } + + @Override + public List<DiagonalGaussianModel> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df) { + final List<Vector> initialMeans = initializer.chooseInitialMeans(database, relation, k, df, Vector.FACTORY); + assert (initialMeans.size() == k); + final int dimensionality = initialMeans.get(0).getDimensionality(); + final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality); + List<DiagonalGaussianModel> models = new ArrayList<>(k); + for(Vector nv : initialMeans) { + models.add(new DiagonalGaussianModel(1. / k, nv, norm)); + } + return models; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector> extends AbstractEMModelFactory.Parameterizer<V> { + @Override + protected DiagonalGaussianModelFactory<V> makeInstance() { + return new DiagonalGaussianModelFactory<>(initializer); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java new file mode 100644 index 00000000..076c819b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java @@ -0,0 +1,414 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Clustering by expectation maximization (EM-Algorithm), also known as Gaussian + * Mixture Modeling (GMM). + * + * <p> + * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br /> + * Maximum Likelihood from Incomplete Data via the EM algorithm.<br> + * In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31 + * </p> + * + * @author Arthur Zimek + * @author Erich Schubert + * + * @apiviz.composedOf EMClusterModelFactory + * + * @param <V> vector type to analyze + * @param <M> model type to produce + */ +@Title("EM-Clustering: Clustering by Expectation Maximization") +@Description("Cluster data via Gaussian mixture modeling and the EM algorithm") +@Reference(authors = "A. P. Dempster, N. M. Laird, D. B. Rubin", // +title = "Maximum Likelihood from Incomplete Data via the EM algorithm", // +booktitle = "Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31", // +url = "http://www.jstor.org/stable/2984875") +@Alias({ "de.lmu.ifi.dbs.elki.algorithm.clustering.EM", "EM" }) +public class EM<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(EM.class); + + /** + * Number of clusters + */ + private int k; + + /** + * Delta parameter + */ + private double delta; + + /** + * Factory for producing the initial cluster model. + */ + private EMClusterModelFactory<V, M> mfactory; + + /** + * Maximum number of iterations to allow + */ + private int maxiter; + + /** + * Retain soft assignments. + */ + private boolean soft; + + private static final double MIN_LOGLIKELIHOOD = -100000; + + /** + * Soft assignment result type. + */ + public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class); + + /** + * Constructor. + * + * @param k k parameter + * @param delta delta parameter + * @param mfactory EM cluster model factory + * @param maxiter Maximum number of iterations + * @param soft Include soft assignments + */ + public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, boolean soft) { + super(); + this.k = k; + this.delta = delta; + this.mfactory = mfactory; + this.maxiter = maxiter; + this.setSoft(soft); + } + + /** + * Performs the EM clustering algorithm on the given database. + * <p/> + * Finally a hard clustering is provided where each clusters gets assigned the + * points exhibiting the highest probability to belong to this cluster. But + * still, the database objects hold associated the complete probability-vector + * for all models. + * + * @param database Database + * @param relation Relation + * @return Result + */ + public Clustering<M> run(Database database, Relation<V> relation) { + if(relation.size() == 0) { + throw new IllegalArgumentException("database empty: must contain elements"); + } + // initial models + if(LOG.isVerbose()) { + LOG.verbose("initializing " + k + " models"); + } + List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC); + WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); + double emNew = assignProbabilitiesToInstances(relation, models, probClusterIGivenX); + + // iteration unless no change + if(LOG.isVerbose()) { + LOG.verbose("iterating EM"); + } + if(LOG.isVerbose()) { + LOG.verbose("iteration " + 0 + " - expectation value: " + emNew); + } + + for(int it = 1; it <= maxiter || maxiter < 0; it++) { + final double emOld = emNew; + recomputeCovarianceMatrices(relation, probClusterIGivenX, models); + // reassign probabilities + emNew = assignProbabilitiesToInstances(relation, models, probClusterIGivenX); + + if(LOG.isVerbose()) { + LOG.verbose("iteration " + it + " - expectation value: " + emNew); + } + if(Math.abs(emOld - emNew) <= delta) { + break; + } + } + + if(LOG.isVerbose()) { + LOG.verbose("assigning clusters"); + } + + // fill result with clusters and models + List<ModifiableDBIDs> hardClusters = new ArrayList<>(k); + for(int i = 0; i < k; i++) { + hardClusters.add(DBIDUtil.newHashSet()); + } + + // provide a hard clustering + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); + int maxIndex = 0; + double currentMax = 0.0; + for(int i = 0; i < k; i++) { + if(clusterProbabilities[i] > currentMax) { + maxIndex = i; + currentMax = clusterProbabilities[i]; + } + } + hardClusters.get(maxIndex).add(iditer); + } + Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering"); + // provide models within the result + for(int i = 0; i < k; i++) { + // TODO: re-do labeling. + // SimpleClassLabel label = new SimpleClassLabel(); + // label.init(result.canonicalClusterLabel(i)); + Cluster<M> model = new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster()); + result.addToplevelCluster(model); + } + if(isSoft()) { + result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs())); + } + else { + probClusterIGivenX.destroy(); + } + return result; + } + + /** + * Recompute the covariance matrixes. + * + * @param relation Vector data + * @param probClusterIGivenX Object probabilities + * @param models Cluster models to update + */ + public static void recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models) { + for(EMClusterModel<?> m : models) { + m.beginEStep(); + } + double[] wsum = new double[models.size()]; + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); + NumberVector instance = relation.get(iditer); + int i = 0; + for(EMClusterModel<?> m : models) { + final double prior = clusterProbabilities[i]; + if(prior > 0.) { + m.updateE(instance, prior); + } + wsum[i] += prior; + ++i; + } + } + int i = 0; + for(EMClusterModel<?> m : models) { + m.finalizeEStep(); + m.setWeight(wsum[i] / relation.size()); + i++; + } + } + + /** + * Assigns the current probability values to the instances in the database and + * compute the expectation value of the current mixture of distributions. + * + * Computed as the sum of the logarithms of the prior probability of each + * instance. + * + * @param relation the database used for assignment to instances + * @param models Cluster models + * @param probClusterIGivenX Output storage for cluster probabilities + * @return the expectation value of the current mixture of distributions + */ + public static double assignProbabilitiesToInstances(Relation<? extends NumberVector> relation, List<? extends EMClusterModel<?>> models, WritableDataStore<double[]> probClusterIGivenX) { + final int k = models.size(); + double emSum = 0.; + + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + NumberVector vec = relation.get(iditer); + double[] probabilities = new double[k]; + { + int i = 0; + for(EMClusterModel<?> m : models) { + probabilities[i] = m.estimateDensity(vec); + ++i; + } + } + double priorProbability = 0.; + for(int i = 0; i < k; i++) { + priorProbability += probabilities[i]; + } + double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD); + emSum += (logP == logP) ? logP : 0.; /* avoid NaN */ + + double[] clusterProbabilities = new double[k]; + if(priorProbability > 0.) { + for(int i = 0; i < k; i++) { + // do not divide by zero! + clusterProbabilities[i] = probabilities[i] / priorProbability; + } + } + probClusterIGivenX.put(iditer, clusterProbabilities); + } + + return emSum / relation.size(); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * @return the soft + */ + public boolean isSoft() { + return soft; + } + + /** + * @param soft the soft to set + */ + public void setSoft(boolean soft) { + this.soft = soft; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer { + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find."); + + /** + * Parameter to specify the termination criterion for maximization of E(M): + * E(M) - E(M') < em.delta, must be a double equal to or greater than 0. + */ + public static final OptionID DELTA_ID = new OptionID("em.delta", // + "The termination criterion for maximization of E(M): " + // + "E(M) - E(M') < em.delta"); + + /** + * Parameter to specify the EM cluster models to use. + */ + public static final OptionID INIT_ID = new OptionID("em.model", // + "Model factory."); + + /** + * Number of clusters. + */ + protected int k; + + /** + * Stopping threshold + */ + protected double delta; + + /** + * Initialization method + */ + protected EMClusterModelFactory<V, M> initializer; + + /** + * Maximum number of iterations. + */ + protected int maxiter = -1; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<EMClusterModelFactory<V, M>> initialP = new ObjectParameter<>(INIT_ID, EMClusterModelFactory.class, MultivariateGaussianModelFactory.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.)// + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(deltaP)) { + delta = deltaP.getValue(); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID)// + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT) // + .setOptional(true); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected EM<V, M> makeInstance() { + return new EM<>(k, delta, initializer, maxiter, false); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java new file mode 100644 index 00000000..f08e6444 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java @@ -0,0 +1,81 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; + +/** + * Models useable in EM clustering. + * + * @author Erich Schubert + */ +public interface EMClusterModel<M extends MeanModel> { + /** + * Begin the E step. + */ + void beginEStep(); + + /** + * Update the + * + * @param vec Vector to process + * @param weight Weight + */ + void updateE(NumberVector vec, double weight); + + /** + * Finalize the E step. + */ + void finalizeEStep(); + + /** + * Estimate the likelihood of a vector. + * + * @param vec Vector + * @return Likelihood. + */ + double estimateDensity(NumberVector vec); + + /** + * Finalize a cluster model. + * + * @return Cluster model + */ + M finalizeCluster(); + + /** + * Get the cluster weight. + * + * @return Cluster weight + */ + double getWeight(); + + /** + * Set the cluster weight. + * + * @param weight Cluster weight + */ + void setWeight(double weight); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java new file mode 100644 index 00000000..223ebcd6 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java @@ -0,0 +1,52 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; + +/** + * Factory for initializing the EM models. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public interface EMClusterModelFactory<V extends NumberVector, M extends MeanModel> { + /** + * Build the initial models + * + * @param database Database + * @param relation Relation + * @param k Number of clusters + * @param df Distance function + * @return Initial models + */ + List<? extends EMClusterModel<M>> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java new file mode 100644 index 00000000..e134edff --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java @@ -0,0 +1,212 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.LUDecomposition; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Model for a single Gaussian cluster. + * + * @author Erich Schubert + */ +public class MultivariateGaussianModel implements EMClusterModel<EMModel> { + /** + * Class logger. + */ + private static Logging LOG = Logging.getLogger(MultivariateGaussianModel.class); + + /** + * Mean vector. + */ + Vector mean; + + /** + * Covariance matrix, and inverse. + */ + Matrix covariance, invCovMatr; + + /** + * Temporary storage, to avoid reallocations. + */ + double[] nmea, mref; + + /** + * Matrix element reference. + */ + double[][] elements; + + /** + * Normalization factor. + */ + double norm, normDistrFactor; + + /** + * Weight aggregation sum + */ + double weight, wsum; + + /** + * Constructor. + * + * @param weight Cluster weight + * @param mean Initial mean + */ + public MultivariateGaussianModel(double weight, Vector mean) { + this(weight, mean, MathUtil.powi(MathUtil.TWOPI, mean.getDimensionality())); + } + + /** + * Constructor. + * + * @param weight Cluster weight + * @param mean Initial mean + * @param norm Normalization factor. + */ + public MultivariateGaussianModel(double weight, Vector mean, double norm) { + this.weight = weight; + final int dim = mean.getDimensionality(); + this.mean = mean; + this.norm = norm; + this.normDistrFactor = 1. / Math.sqrt(norm); // assume det=1 + this.mref = mean.getArrayRef(); + this.nmea = new double[dim]; + this.covariance = new Matrix(dim, dim); + this.elements = this.covariance.getArrayRef(); + this.wsum = 0.; + } + + @Override + public void beginEStep() { + if(covariance == null) { + covariance = new Matrix(mean.getDimensionality(), mean.getDimensionality()); + return; + } + wsum = 0.; + } + + @Override + public void updateE(NumberVector vec, double wei) { + assert (vec.getDimensionality() == mref.length); + final double nwsum = wsum + wei; + // Compute new means + for(int i = 0; i < mref.length; i++) { + final double delta = vec.doubleValue(i) - mref[i]; + final double rval = delta * wei / nwsum; + nmea[i] = mref[i] + rval; + } + // Update covariance matrix + for(int i = 0; i < mref.length; i++) { + for(int j = i; j < mref.length; j++) { + // We DO want to use the new mean once and the old mean once! + // It does not matter which one is which. + double delta = (vec.doubleValue(i) - nmea[i]) * (vec.doubleValue(j) - mref[j]) * wei; + elements[i][j] = elements[i][j] + delta; + // Optimize via symmetry + if(i != j) { + elements[j][i] = elements[j][i] + delta; + } + } + } + // Use new values. + wsum = nwsum; + System.arraycopy(nmea, 0, mref, 0, nmea.length); + } + + @Override + public void finalizeEStep() { + final int dim = mean.getDimensionality(); + // TODO: improve handling of degenerated cases? + if(wsum > 0.) { + covariance.timesEquals(1. / wsum); + } + LUDecomposition lu = new LUDecomposition(covariance); + double det = lu.det(); + if(det <= 0.) { + // Add a small value to the diagonal + covariance.plusDiagonalEquals(Matrix.SINGULARITY_CHEAT); + lu = new LUDecomposition(covariance); // Should no longer be zero now. + det = lu.det(); + assert (det > 0) : "Singularity cheat did not resolve zero determinant."; + } + normDistrFactor = 1. / Math.sqrt(norm * det); + invCovMatr = lu.solve(Matrix.identity(dim, dim)); + } + + /** + * Compute the Mahalanobis distance from the centroid for a given vector. + * + * @param vec Vector + * @return Mahalanobis distance + */ + public double mahalanobisDistance(Vector vec) { + if (invCovMatr != null) { + return VMath.mahalanobisDistance(invCovMatr.getArrayRef(), vec.getArrayRef(), mref); + } + Vector difference = vec.minus(mean); + return difference.transposeTimes(difference); + } + + /** + * Compute the Mahalanobis distance from the centroid for a given vector. + * + * @param vec Vector + * @return Mahalanobis distance + */ + public double mahalanobisDistance(NumberVector vec) { + Vector difference = vec.getColumnVector().minusEquals(mean); + return (invCovMatr != null) ? difference.transposeTimesTimes(invCovMatr, difference) : difference.transposeTimes(difference); + } + + @Override + public double estimateDensity(NumberVector vec) { + double power = mahalanobisDistance(vec) * .5; + double prob = normDistrFactor * Math.exp(-power); + if(!(prob >= 0.)) { + LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor); + prob = 0.; + } + return prob * weight; + } + + @Override + public double getWeight() { + return weight; + } + + @Override + public void setWeight(double weight) { + this.weight = weight; + } + + @Override + public EMModel finalizeCluster() { + return new EMModel(mean, covariance); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java new file mode 100644 index 00000000..ca0b7db4 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java @@ -0,0 +1,89 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Factory for EM with multivariate gaussian models (with covariance; also known + * as Gaussian Mixture Modeling, GMM). + * + * These models have individual covariance matrixes, so this corresponds to the + * {@code 'VVV'} model in Mclust (R). + * + * @author Erich Schubert + * + * @apiviz.has MultivariateGaussianModel + * + * @param <V> vector type + */ +public class MultivariateGaussianModelFactory<V extends NumberVector> extends AbstractEMModelFactory<V, EMModel> { + /** + * Constructor. + * + * @param initializer Class for choosing the inital seeds. + */ + public MultivariateGaussianModelFactory(KMeansInitialization<V> initializer) { + super(initializer); + } + + @Override + public List<MultivariateGaussianModel> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df) { + final List<Vector> initialMeans = initializer.chooseInitialMeans(database, relation, k, df, Vector.FACTORY); + assert (initialMeans.size() == k); + final int dimensionality = initialMeans.get(0).getDimensionality(); + final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality); + List<MultivariateGaussianModel> models = new ArrayList<>(k); + for(Vector nv : initialMeans) { + models.add(new MultivariateGaussianModel(1. / k, nv, norm)); + } + return models; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector> extends AbstractEMModelFactory.Parameterizer<V> { + @Override + protected MultivariateGaussianModelFactory<V> makeInstance() { + return new MultivariateGaussianModelFactory<>(initializer); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java new file mode 100644 index 00000000..b1d62a17 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java @@ -0,0 +1,190 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Simple spherical Gaussian cluster. + * + * @author Erich Schubert + */ +public class SphericalGaussianModel implements EMClusterModel<EMModel> { + /** + * Class logger. + */ + private static Logging LOG = Logging.getLogger(SphericalGaussianModel.class); + + /** + * Mean vector. + */ + Vector mean; + + /** + * Variances. + */ + double variance; + + /** + * Temporary storage, to avoid reallocations. + */ + double[] nmea, mref; + + /** + * Normalization factor. + */ + double norm, normDistrFactor; + + /** + * Weight aggregation sum + */ + double weight, wsum; + + /** + * Constructor. + * + * @param weight Cluster weight + * @param mean Initial mean + */ + public SphericalGaussianModel(double weight, Vector mean) { + this(weight, mean, MathUtil.powi(MathUtil.TWOPI, mean.getDimensionality())); + } + + /** + * Constructor. + * + * @param weight Cluster weight + * @param mean Initial mean + * @param norm Normalization factor. + */ + public SphericalGaussianModel(double weight, Vector mean, double norm) { + this.weight = weight; + final int dim = mean.getDimensionality(); + this.mean = mean; + this.norm = norm; + this.normDistrFactor = 1. / Math.sqrt(norm); // assume det=1 + this.mref = mean.getArrayRef(); + this.nmea = new double[dim]; + this.variance = 1.; + this.wsum = 0.; + } + + @Override + public void beginEStep() { + wsum = 0.; + } + + @Override + public void updateE(NumberVector vec, double wei) { + assert (vec.getDimensionality() == mref.length); + final double nwsum = wsum + wei; + // Compute new means + for(int i = 0; i < mref.length; i++) { + final double delta = vec.doubleValue(i) - mref[i]; + final double rval = delta * wei / nwsum; + nmea[i] = mref[i] + rval; + } + // Update variances + for(int i = 0; i < mref.length; i++) { + // We DO want to use the new mean once and the old mean once! + // It does not matter which one is which. + variance += (vec.doubleValue(i) - nmea[i]) * (vec.doubleValue(i) - mref[i]) * wei; + } + // Use new values. + wsum = nwsum; + System.arraycopy(nmea, 0, mref, 0, nmea.length); + } + + @Override + public void finalizeEStep() { + if(wsum > 0.) { + variance = variance / (wsum * mref.length); + normDistrFactor = 1. / Math.sqrt(norm * variance); + } + else { + // Degenerate + normDistrFactor = 1. / Math.sqrt(norm); + } + } + + /** + * Compute the Mahalanobis distance from the centroid for a given vector. + * + * @param vec Vector + * @return Mahalanobis distance + */ + public double mahalanobisDistance(Vector vec) { + double[] difference = vec.minus(mean).getArrayRef(); + double agg = 0.; + for(int i = 0; i < difference.length; i++) { + agg += difference[i] / variance * difference[i]; + } + return agg; + } + + /** + * Compute the Mahalanobis distance from the centroid for a given vector. + * + * @param vec Vector + * @return Mahalanobis distance + */ + public double mahalanobisDistance(NumberVector vec) { + double[] difference = vec.getColumnVector().minusEquals(mean).getArrayRef(); + double agg = 0.; + for(int i = 0; i < difference.length; i++) { + agg += difference[i] / variance * difference[i]; + } + return agg; + } + + @Override + public double estimateDensity(NumberVector vec) { + double power = mahalanobisDistance(vec) * .5; + double prob = normDistrFactor * Math.exp(-power); + if(!(prob >= 0.)) { + LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor); + prob = 0.; + } + return prob * weight; + } + + @Override + public double getWeight() { + return weight; + } + + @Override + public void setWeight(double weight) { + this.weight = weight; + } + + @Override + public EMModel finalizeCluster() { + return new EMModel(mean, Matrix.identity(nmea.length, nmea.length).timesEquals(variance)); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java new file mode 100644 index 00000000..7de77188 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java @@ -0,0 +1,88 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Factory for EM with multivariate gaussian models using diagonal matrixes. + * + * These models have individual variances, but no covariance, so this + * corresponds to the {@code 'VVI'} model in Mclust (R). + * + * @author Erich Schubert + * + * @apiviz.has SphericalGaussianModel + * + * @param <V> vector type + */ +public class SphericalGaussianModelFactory<V extends NumberVector> extends AbstractEMModelFactory<V, EMModel> { + /** + * Constructor. + * + * @param initializer Class for choosing the inital seeds. + */ + public SphericalGaussianModelFactory(KMeansInitialization<V> initializer) { + super(initializer); + } + + @Override + public List<SphericalGaussianModel> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df) { + final List<Vector> initialMeans = initializer.chooseInitialMeans(database, relation, k, df, Vector.FACTORY); + assert (initialMeans.size() == k); + final int dimensionality = initialMeans.get(0).getDimensionality(); + final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality); + List<SphericalGaussianModel> models = new ArrayList<>(k); + for(Vector nv : initialMeans) { + models.add(new SphericalGaussianModel(1. / k, nv, norm)); + } + return models; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector> extends AbstractEMModelFactory.Parameterizer<V> { + @Override + protected SphericalGaussianModelFactory<V> makeInstance() { + return new SphericalGaussianModelFactory<>(initializer); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java new file mode 100644 index 00000000..22f0b669 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java @@ -0,0 +1,27 @@ +/** + * Expectation-Maximization clustering algorithm. + */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
\ No newline at end of file |