summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/em')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java89
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java200
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java88
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java414
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java81
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java52
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java212
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java89
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java190
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java88
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java27
11 files changed, 1530 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java
new file mode 100644
index 00000000..605c3aed
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/AbstractEMModelFactory.java
@@ -0,0 +1,89 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.RandomlyGeneratedInitialMeans;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Abstract base class for initializing EM.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <M> Model type
+ */
+public abstract class AbstractEMModelFactory<V extends NumberVector, M extends MeanModel> implements EMClusterModelFactory<V, M> {
+ /**
+ * Class to choose the initial means
+ */
+ protected KMeansInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param initializer Class for choosing the initial seeds.
+ */
+ public AbstractEMModelFactory(KMeansInitialization<V> initializer) {
+ super();
+ this.initializer = initializer;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> vector type
+ */
+ public abstract static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the cluster center initialization.
+ */
+ public static final OptionID INIT_ID = new OptionID("em.centers", //
+ "Method to choose the initial cluster centers.");
+
+ /**
+ * Initialization method
+ */
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java
new file mode 100644
index 00000000..4ba9290f
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModel.java
@@ -0,0 +1,200 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Simpler model for a single Gaussian cluster, without covariances.
+ *
+ * @author Erich Schubert
+ */
+public class DiagonalGaussianModel implements EMClusterModel<EMModel> {
+ /**
+ * Class logger.
+ */
+ private static Logging LOG = Logging.getLogger(DiagonalGaussianModel.class);
+
+ /**
+ * Mean vector.
+ */
+ Vector mean;
+
+ /**
+ * Per-dimension variances.
+ */
+ double[] variances;
+
+ /**
+ * Temporary storage, to avoid reallocations.
+ */
+ double[] nmea, mref;
+
+ /**
+ * Normalization factor.
+ */
+ double norm, normDistrFactor;
+
+ /**
+ * Weight aggregation sum
+ */
+ double weight, wsum;
+
+ /**
+ * Constructor.
+ *
+ * @param weight Cluster weight
+ * @param mean Initial mean
+ */
+ public DiagonalGaussianModel(double weight, Vector mean) {
+ this(weight, mean, MathUtil.powi(MathUtil.TWOPI, mean.getDimensionality()));
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param weight Cluster weight
+ * @param mean Initial mean
+ * @param norm Normalization factor.
+ */
+ public DiagonalGaussianModel(double weight, Vector mean, double norm) {
+ this.weight = weight;
+ final int dim = mean.getDimensionality();
+ this.mean = mean;
+ this.norm = norm;
+ this.normDistrFactor = 1. / Math.sqrt(norm); // assume det=1
+ this.mref = mean.getArrayRef();
+ this.nmea = new double[dim];
+ this.variances = new double[dim];
+ Arrays.fill(variances, 1.);
+ this.wsum = 0.;
+ }
+
+ @Override
+ public void beginEStep() {
+ wsum = 0.;
+ }
+
+ @Override
+ public void updateE(NumberVector vec, double wei) {
+ assert (vec.getDimensionality() == mref.length);
+ final double nwsum = wsum + wei;
+ // Compute new means
+ for(int i = 0; i < mref.length; i++) {
+ final double delta = vec.doubleValue(i) - mref[i];
+ final double rval = delta * wei / nwsum;
+ nmea[i] = mref[i] + rval;
+ }
+ // Update variances
+ for(int i = 0; i < mref.length; i++) {
+ // We DO want to use the new mean once and the old mean once!
+ // It does not matter which one is which.
+ variances[i] += (vec.doubleValue(i) - nmea[i]) * (vec.doubleValue(i) - mref[i]) * wei;
+ }
+ // Use new values.
+ wsum = nwsum;
+ System.arraycopy(nmea, 0, mref, 0, nmea.length);
+ }
+
+ @Override
+ public void finalizeEStep() {
+ if(wsum > 0.) {
+ final double s = 1. / wsum;
+ double det = 1.;
+ for(int i = 0; i < variances.length; i++) {
+ double v = variances[i];
+ v = v > 0 ? v * s : Matrix.SINGULARITY_CHEAT;
+ variances[i] = v;
+ det *= v;
+ }
+ normDistrFactor = 1. / Math.sqrt(norm * det);
+ }
+ else {
+ // Degenerate
+ normDistrFactor = 1. / Math.sqrt(norm);
+ }
+ }
+
+ /**
+ * Compute the Mahalanobis distance from the centroid for a given vector.
+ *
+ * @param vec Vector
+ * @return Mahalanobis distance
+ */
+ public double mahalanobisDistance(Vector vec) {
+ double[] difference = vec.minus(mean).getArrayRef();
+ double agg = 0.;
+ for(int i = 0; i < variances.length; i++) {
+ agg += difference[i] / variances[i] * difference[i];
+ }
+ return agg;
+ }
+
+ /**
+ * Compute the Mahalanobis distance from the centroid for a given vector.
+ *
+ * @param vec Vector
+ * @return Mahalanobis distance
+ */
+ public double mahalanobisDistance(NumberVector vec) {
+ double[] difference = vec.getColumnVector().minusEquals(mean).getArrayRef();
+ double agg = 0.;
+ for(int i = 0; i < variances.length; i++) {
+ agg += difference[i] / variances[i] * difference[i];
+ }
+ return agg;
+ }
+
+ @Override
+ public double estimateDensity(NumberVector vec) {
+ double power = mahalanobisDistance(vec) * .5;
+ double prob = normDistrFactor * Math.exp(-power);
+ if(!(prob >= 0.)) {
+ LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor);
+ prob = 0.;
+ }
+ return prob * weight;
+ }
+
+ @Override
+ public double getWeight() {
+ return weight;
+ }
+
+ @Override
+ public void setWeight(double weight) {
+ this.weight = weight;
+ }
+
+ @Override
+ public EMModel finalizeCluster() {
+ return new EMModel(mean, Matrix.diagonal(variances));
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java
new file mode 100644
index 00000000..3ac02dae
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/DiagonalGaussianModelFactory.java
@@ -0,0 +1,88 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Factory for EM with multivariate gaussian models using diagonal matrixes.
+ *
+ * These models have individual variances, but no covariance, so this
+ * corresponds to the {@code 'VVI'} model in Mclust (R).
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has DiagonalGaussianModel
+ *
+ * @param <V> vector type
+ */
+public class DiagonalGaussianModelFactory<V extends NumberVector> extends AbstractEMModelFactory<V, EMModel> {
+ /**
+ * Constructor.
+ *
+ * @param initializer Class for choosing the inital seeds.
+ */
+ public DiagonalGaussianModelFactory(KMeansInitialization<V> initializer) {
+ super(initializer);
+ }
+
+ @Override
+ public List<DiagonalGaussianModel> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df) {
+ final List<Vector> initialMeans = initializer.chooseInitialMeans(database, relation, k, df, Vector.FACTORY);
+ assert (initialMeans.size() == k);
+ final int dimensionality = initialMeans.get(0).getDimensionality();
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality);
+ List<DiagonalGaussianModel> models = new ArrayList<>(k);
+ for(Vector nv : initialMeans) {
+ models.add(new DiagonalGaussianModel(1. / k, nv, norm));
+ }
+ return models;
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractEMModelFactory.Parameterizer<V> {
+ @Override
+ protected DiagonalGaussianModelFactory<V> makeInstance() {
+ return new DiagonalGaussianModelFactory<>(initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java
new file mode 100644
index 00000000..076c819b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EM.java
@@ -0,0 +1,414 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeans;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Clustering by expectation maximization (EM-Algorithm), also known as Gaussian
+ * Mixture Modeling (GMM).
+ *
+ * <p>
+ * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br />
+ * Maximum Likelihood from Incomplete Data via the EM algorithm.<br>
+ * In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31
+ * </p>
+ *
+ * @author Arthur Zimek
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf EMClusterModelFactory
+ *
+ * @param <V> vector type to analyze
+ * @param <M> model type to produce
+ */
+@Title("EM-Clustering: Clustering by Expectation Maximization")
+@Description("Cluster data via Gaussian mixture modeling and the EM algorithm")
+@Reference(authors = "A. P. Dempster, N. M. Laird, D. B. Rubin", //
+title = "Maximum Likelihood from Incomplete Data via the EM algorithm", //
+booktitle = "Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31", //
+url = "http://www.jstor.org/stable/2984875")
+@Alias({ "de.lmu.ifi.dbs.elki.algorithm.clustering.EM", "EM" })
+public class EM<V extends NumberVector, M extends MeanModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(EM.class);
+
+ /**
+ * Number of clusters
+ */
+ private int k;
+
+ /**
+ * Delta parameter
+ */
+ private double delta;
+
+ /**
+ * Factory for producing the initial cluster model.
+ */
+ private EMClusterModelFactory<V, M> mfactory;
+
+ /**
+ * Maximum number of iterations to allow
+ */
+ private int maxiter;
+
+ /**
+ * Retain soft assignments.
+ */
+ private boolean soft;
+
+ private static final double MIN_LOGLIKELIHOOD = -100000;
+
+ /**
+ * Soft assignment result type.
+ */
+ public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class);
+
+ /**
+ * Constructor.
+ *
+ * @param k k parameter
+ * @param delta delta parameter
+ * @param mfactory EM cluster model factory
+ * @param maxiter Maximum number of iterations
+ * @param soft Include soft assignments
+ */
+ public EM(int k, double delta, EMClusterModelFactory<V, M> mfactory, int maxiter, boolean soft) {
+ super();
+ this.k = k;
+ this.delta = delta;
+ this.mfactory = mfactory;
+ this.maxiter = maxiter;
+ this.setSoft(soft);
+ }
+
+ /**
+ * Performs the EM clustering algorithm on the given database.
+ * <p/>
+ * Finally a hard clustering is provided where each clusters gets assigned the
+ * points exhibiting the highest probability to belong to this cluster. But
+ * still, the database objects hold associated the complete probability-vector
+ * for all models.
+ *
+ * @param database Database
+ * @param relation Relation
+ * @return Result
+ */
+ public Clustering<M> run(Database database, Relation<V> relation) {
+ if(relation.size() == 0) {
+ throw new IllegalArgumentException("database empty: must contain elements");
+ }
+ // initial models
+ if(LOG.isVerbose()) {
+ LOG.verbose("initializing " + k + " models");
+ }
+ List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ double emNew = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
+
+ // iteration unless no change
+ if(LOG.isVerbose()) {
+ LOG.verbose("iterating EM");
+ }
+ if(LOG.isVerbose()) {
+ LOG.verbose("iteration " + 0 + " - expectation value: " + emNew);
+ }
+
+ for(int it = 1; it <= maxiter || maxiter < 0; it++) {
+ final double emOld = emNew;
+ recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
+ // reassign probabilities
+ emNew = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("iteration " + it + " - expectation value: " + emNew);
+ }
+ if(Math.abs(emOld - emNew) <= delta) {
+ break;
+ }
+ }
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("assigning clusters");
+ }
+
+ // fill result with clusters and models
+ List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
+ for(int i = 0; i < k; i++) {
+ hardClusters.add(DBIDUtil.newHashSet());
+ }
+
+ // provide a hard clustering
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
+ int maxIndex = 0;
+ double currentMax = 0.0;
+ for(int i = 0; i < k; i++) {
+ if(clusterProbabilities[i] > currentMax) {
+ maxIndex = i;
+ currentMax = clusterProbabilities[i];
+ }
+ }
+ hardClusters.get(maxIndex).add(iditer);
+ }
+ Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
+ // provide models within the result
+ for(int i = 0; i < k; i++) {
+ // TODO: re-do labeling.
+ // SimpleClassLabel label = new SimpleClassLabel();
+ // label.init(result.canonicalClusterLabel(i));
+ Cluster<M> model = new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster());
+ result.addToplevelCluster(model);
+ }
+ if(isSoft()) {
+ result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
+ }
+ else {
+ probClusterIGivenX.destroy();
+ }
+ return result;
+ }
+
+ /**
+ * Recompute the covariance matrixes.
+ *
+ * @param relation Vector data
+ * @param probClusterIGivenX Object probabilities
+ * @param models Cluster models to update
+ */
+ public static void recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models) {
+ for(EMClusterModel<?> m : models) {
+ m.beginEStep();
+ }
+ double[] wsum = new double[models.size()];
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
+ NumberVector instance = relation.get(iditer);
+ int i = 0;
+ for(EMClusterModel<?> m : models) {
+ final double prior = clusterProbabilities[i];
+ if(prior > 0.) {
+ m.updateE(instance, prior);
+ }
+ wsum[i] += prior;
+ ++i;
+ }
+ }
+ int i = 0;
+ for(EMClusterModel<?> m : models) {
+ m.finalizeEStep();
+ m.setWeight(wsum[i] / relation.size());
+ i++;
+ }
+ }
+
+ /**
+ * Assigns the current probability values to the instances in the database and
+ * compute the expectation value of the current mixture of distributions.
+ *
+ * Computed as the sum of the logarithms of the prior probability of each
+ * instance.
+ *
+ * @param relation the database used for assignment to instances
+ * @param models Cluster models
+ * @param probClusterIGivenX Output storage for cluster probabilities
+ * @return the expectation value of the current mixture of distributions
+ */
+ public static double assignProbabilitiesToInstances(Relation<? extends NumberVector> relation, List<? extends EMClusterModel<?>> models, WritableDataStore<double[]> probClusterIGivenX) {
+ final int k = models.size();
+ double emSum = 0.;
+
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ NumberVector vec = relation.get(iditer);
+ double[] probabilities = new double[k];
+ {
+ int i = 0;
+ for(EMClusterModel<?> m : models) {
+ probabilities[i] = m.estimateDensity(vec);
+ ++i;
+ }
+ }
+ double priorProbability = 0.;
+ for(int i = 0; i < k; i++) {
+ priorProbability += probabilities[i];
+ }
+ double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD);
+ emSum += (logP == logP) ? logP : 0.; /* avoid NaN */
+
+ double[] clusterProbabilities = new double[k];
+ if(priorProbability > 0.) {
+ for(int i = 0; i < k; i++) {
+ // do not divide by zero!
+ clusterProbabilities[i] = probabilities[i] / priorProbability;
+ }
+ }
+ probClusterIGivenX.put(iditer, clusterProbabilities);
+ }
+
+ return emSum / relation.size();
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * @return the soft
+ */
+ public boolean isSoft() {
+ return soft;
+ }
+
+ /**
+ * @param soft the soft to set
+ */
+ public void setSoft(boolean soft) {
+ this.soft = soft;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector, M extends MeanModel> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the termination criterion for maximization of E(M):
+ * E(M) - E(M') < em.delta, must be a double equal to or greater than 0.
+ */
+ public static final OptionID DELTA_ID = new OptionID("em.delta", //
+ "The termination criterion for maximization of E(M): " + //
+ "E(M) - E(M') < em.delta");
+
+ /**
+ * Parameter to specify the EM cluster models to use.
+ */
+ public static final OptionID INIT_ID = new OptionID("em.model", //
+ "Model factory.");
+
+ /**
+ * Number of clusters.
+ */
+ protected int k;
+
+ /**
+ * Stopping threshold
+ */
+ protected double delta;
+
+ /**
+ * Initialization method
+ */
+ protected EMClusterModelFactory<V, M> initializer;
+
+ /**
+ * Maximum number of iterations.
+ */
+ protected int maxiter = -1;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<EMClusterModelFactory<V, M>> initialP = new ObjectParameter<>(INIT_ID, EMClusterModelFactory.class, MultivariateGaussianModelFactory.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.)//
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(deltaP)) {
+ delta = deltaP.getValue();
+ }
+
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID)//
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT) //
+ .setOptional(true);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected EM<V, M> makeInstance() {
+ return new EM<>(k, delta, initializer, maxiter, false);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java
new file mode 100644
index 00000000..f08e6444
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModel.java
@@ -0,0 +1,81 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+
+/**
+ * Models useable in EM clustering.
+ *
+ * @author Erich Schubert
+ */
+public interface EMClusterModel<M extends MeanModel> {
+ /**
+ * Begin the E step.
+ */
+ void beginEStep();
+
+ /**
+ * Update the
+ *
+ * @param vec Vector to process
+ * @param weight Weight
+ */
+ void updateE(NumberVector vec, double weight);
+
+ /**
+ * Finalize the E step.
+ */
+ void finalizeEStep();
+
+ /**
+ * Estimate the likelihood of a vector.
+ *
+ * @param vec Vector
+ * @return Likelihood.
+ */
+ double estimateDensity(NumberVector vec);
+
+ /**
+ * Finalize a cluster model.
+ *
+ * @return Cluster model
+ */
+ M finalizeCluster();
+
+ /**
+ * Get the cluster weight.
+ *
+ * @return Cluster weight
+ */
+ double getWeight();
+
+ /**
+ * Set the cluster weight.
+ *
+ * @param weight Cluster weight
+ */
+ void setWeight(double weight);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java
new file mode 100644
index 00000000..223ebcd6
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/EMClusterModelFactory.java
@@ -0,0 +1,52 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+
+/**
+ * Factory for initializing the EM models.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public interface EMClusterModelFactory<V extends NumberVector, M extends MeanModel> {
+ /**
+ * Build the initial models
+ *
+ * @param database Database
+ * @param relation Relation
+ * @param k Number of clusters
+ * @param df Distance function
+ * @return Initial models
+ */
+ List<? extends EMClusterModel<M>> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java
new file mode 100644
index 00000000..e134edff
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModel.java
@@ -0,0 +1,212 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LUDecomposition;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Model for a single Gaussian cluster.
+ *
+ * @author Erich Schubert
+ */
+public class MultivariateGaussianModel implements EMClusterModel<EMModel> {
+ /**
+ * Class logger.
+ */
+ private static Logging LOG = Logging.getLogger(MultivariateGaussianModel.class);
+
+ /**
+ * Mean vector.
+ */
+ Vector mean;
+
+ /**
+ * Covariance matrix, and inverse.
+ */
+ Matrix covariance, invCovMatr;
+
+ /**
+ * Temporary storage, to avoid reallocations.
+ */
+ double[] nmea, mref;
+
+ /**
+ * Matrix element reference.
+ */
+ double[][] elements;
+
+ /**
+ * Normalization factor.
+ */
+ double norm, normDistrFactor;
+
+ /**
+ * Weight aggregation sum
+ */
+ double weight, wsum;
+
+ /**
+ * Constructor.
+ *
+ * @param weight Cluster weight
+ * @param mean Initial mean
+ */
+ public MultivariateGaussianModel(double weight, Vector mean) {
+ this(weight, mean, MathUtil.powi(MathUtil.TWOPI, mean.getDimensionality()));
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param weight Cluster weight
+ * @param mean Initial mean
+ * @param norm Normalization factor.
+ */
+ public MultivariateGaussianModel(double weight, Vector mean, double norm) {
+ this.weight = weight;
+ final int dim = mean.getDimensionality();
+ this.mean = mean;
+ this.norm = norm;
+ this.normDistrFactor = 1. / Math.sqrt(norm); // assume det=1
+ this.mref = mean.getArrayRef();
+ this.nmea = new double[dim];
+ this.covariance = new Matrix(dim, dim);
+ this.elements = this.covariance.getArrayRef();
+ this.wsum = 0.;
+ }
+
+ @Override
+ public void beginEStep() {
+ if(covariance == null) {
+ covariance = new Matrix(mean.getDimensionality(), mean.getDimensionality());
+ return;
+ }
+ wsum = 0.;
+ }
+
+ @Override
+ public void updateE(NumberVector vec, double wei) {
+ assert (vec.getDimensionality() == mref.length);
+ final double nwsum = wsum + wei;
+ // Compute new means
+ for(int i = 0; i < mref.length; i++) {
+ final double delta = vec.doubleValue(i) - mref[i];
+ final double rval = delta * wei / nwsum;
+ nmea[i] = mref[i] + rval;
+ }
+ // Update covariance matrix
+ for(int i = 0; i < mref.length; i++) {
+ for(int j = i; j < mref.length; j++) {
+ // We DO want to use the new mean once and the old mean once!
+ // It does not matter which one is which.
+ double delta = (vec.doubleValue(i) - nmea[i]) * (vec.doubleValue(j) - mref[j]) * wei;
+ elements[i][j] = elements[i][j] + delta;
+ // Optimize via symmetry
+ if(i != j) {
+ elements[j][i] = elements[j][i] + delta;
+ }
+ }
+ }
+ // Use new values.
+ wsum = nwsum;
+ System.arraycopy(nmea, 0, mref, 0, nmea.length);
+ }
+
+ @Override
+ public void finalizeEStep() {
+ final int dim = mean.getDimensionality();
+ // TODO: improve handling of degenerated cases?
+ if(wsum > 0.) {
+ covariance.timesEquals(1. / wsum);
+ }
+ LUDecomposition lu = new LUDecomposition(covariance);
+ double det = lu.det();
+ if(det <= 0.) {
+ // Add a small value to the diagonal
+ covariance.plusDiagonalEquals(Matrix.SINGULARITY_CHEAT);
+ lu = new LUDecomposition(covariance); // Should no longer be zero now.
+ det = lu.det();
+ assert (det > 0) : "Singularity cheat did not resolve zero determinant.";
+ }
+ normDistrFactor = 1. / Math.sqrt(norm * det);
+ invCovMatr = lu.solve(Matrix.identity(dim, dim));
+ }
+
+ /**
+ * Compute the Mahalanobis distance from the centroid for a given vector.
+ *
+ * @param vec Vector
+ * @return Mahalanobis distance
+ */
+ public double mahalanobisDistance(Vector vec) {
+ if (invCovMatr != null) {
+ return VMath.mahalanobisDistance(invCovMatr.getArrayRef(), vec.getArrayRef(), mref);
+ }
+ Vector difference = vec.minus(mean);
+ return difference.transposeTimes(difference);
+ }
+
+ /**
+ * Compute the Mahalanobis distance from the centroid for a given vector.
+ *
+ * @param vec Vector
+ * @return Mahalanobis distance
+ */
+ public double mahalanobisDistance(NumberVector vec) {
+ Vector difference = vec.getColumnVector().minusEquals(mean);
+ return (invCovMatr != null) ? difference.transposeTimesTimes(invCovMatr, difference) : difference.transposeTimes(difference);
+ }
+
+ @Override
+ public double estimateDensity(NumberVector vec) {
+ double power = mahalanobisDistance(vec) * .5;
+ double prob = normDistrFactor * Math.exp(-power);
+ if(!(prob >= 0.)) {
+ LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor);
+ prob = 0.;
+ }
+ return prob * weight;
+ }
+
+ @Override
+ public double getWeight() {
+ return weight;
+ }
+
+ @Override
+ public void setWeight(double weight) {
+ this.weight = weight;
+ }
+
+ @Override
+ public EMModel finalizeCluster() {
+ return new EMModel(mean, covariance);
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java
new file mode 100644
index 00000000..ca0b7db4
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/MultivariateGaussianModelFactory.java
@@ -0,0 +1,89 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Factory for EM with multivariate gaussian models (with covariance; also known
+ * as Gaussian Mixture Modeling, GMM).
+ *
+ * These models have individual covariance matrixes, so this corresponds to the
+ * {@code 'VVV'} model in Mclust (R).
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MultivariateGaussianModel
+ *
+ * @param <V> vector type
+ */
+public class MultivariateGaussianModelFactory<V extends NumberVector> extends AbstractEMModelFactory<V, EMModel> {
+ /**
+ * Constructor.
+ *
+ * @param initializer Class for choosing the inital seeds.
+ */
+ public MultivariateGaussianModelFactory(KMeansInitialization<V> initializer) {
+ super(initializer);
+ }
+
+ @Override
+ public List<MultivariateGaussianModel> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df) {
+ final List<Vector> initialMeans = initializer.chooseInitialMeans(database, relation, k, df, Vector.FACTORY);
+ assert (initialMeans.size() == k);
+ final int dimensionality = initialMeans.get(0).getDimensionality();
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality);
+ List<MultivariateGaussianModel> models = new ArrayList<>(k);
+ for(Vector nv : initialMeans) {
+ models.add(new MultivariateGaussianModel(1. / k, nv, norm));
+ }
+ return models;
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractEMModelFactory.Parameterizer<V> {
+ @Override
+ protected MultivariateGaussianModelFactory<V> makeInstance() {
+ return new MultivariateGaussianModelFactory<>(initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java
new file mode 100644
index 00000000..b1d62a17
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModel.java
@@ -0,0 +1,190 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Simple spherical Gaussian cluster.
+ *
+ * @author Erich Schubert
+ */
+public class SphericalGaussianModel implements EMClusterModel<EMModel> {
+ /**
+ * Class logger.
+ */
+ private static Logging LOG = Logging.getLogger(SphericalGaussianModel.class);
+
+ /**
+ * Mean vector.
+ */
+ Vector mean;
+
+ /**
+ * Variances.
+ */
+ double variance;
+
+ /**
+ * Temporary storage, to avoid reallocations.
+ */
+ double[] nmea, mref;
+
+ /**
+ * Normalization factor.
+ */
+ double norm, normDistrFactor;
+
+ /**
+ * Weight aggregation sum
+ */
+ double weight, wsum;
+
+ /**
+ * Constructor.
+ *
+ * @param weight Cluster weight
+ * @param mean Initial mean
+ */
+ public SphericalGaussianModel(double weight, Vector mean) {
+ this(weight, mean, MathUtil.powi(MathUtil.TWOPI, mean.getDimensionality()));
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param weight Cluster weight
+ * @param mean Initial mean
+ * @param norm Normalization factor.
+ */
+ public SphericalGaussianModel(double weight, Vector mean, double norm) {
+ this.weight = weight;
+ final int dim = mean.getDimensionality();
+ this.mean = mean;
+ this.norm = norm;
+ this.normDistrFactor = 1. / Math.sqrt(norm); // assume det=1
+ this.mref = mean.getArrayRef();
+ this.nmea = new double[dim];
+ this.variance = 1.;
+ this.wsum = 0.;
+ }
+
+ @Override
+ public void beginEStep() {
+ wsum = 0.;
+ }
+
+ @Override
+ public void updateE(NumberVector vec, double wei) {
+ assert (vec.getDimensionality() == mref.length);
+ final double nwsum = wsum + wei;
+ // Compute new means
+ for(int i = 0; i < mref.length; i++) {
+ final double delta = vec.doubleValue(i) - mref[i];
+ final double rval = delta * wei / nwsum;
+ nmea[i] = mref[i] + rval;
+ }
+ // Update variances
+ for(int i = 0; i < mref.length; i++) {
+ // We DO want to use the new mean once and the old mean once!
+ // It does not matter which one is which.
+ variance += (vec.doubleValue(i) - nmea[i]) * (vec.doubleValue(i) - mref[i]) * wei;
+ }
+ // Use new values.
+ wsum = nwsum;
+ System.arraycopy(nmea, 0, mref, 0, nmea.length);
+ }
+
+ @Override
+ public void finalizeEStep() {
+ if(wsum > 0.) {
+ variance = variance / (wsum * mref.length);
+ normDistrFactor = 1. / Math.sqrt(norm * variance);
+ }
+ else {
+ // Degenerate
+ normDistrFactor = 1. / Math.sqrt(norm);
+ }
+ }
+
+ /**
+ * Compute the Mahalanobis distance from the centroid for a given vector.
+ *
+ * @param vec Vector
+ * @return Mahalanobis distance
+ */
+ public double mahalanobisDistance(Vector vec) {
+ double[] difference = vec.minus(mean).getArrayRef();
+ double agg = 0.;
+ for(int i = 0; i < difference.length; i++) {
+ agg += difference[i] / variance * difference[i];
+ }
+ return agg;
+ }
+
+ /**
+ * Compute the Mahalanobis distance from the centroid for a given vector.
+ *
+ * @param vec Vector
+ * @return Mahalanobis distance
+ */
+ public double mahalanobisDistance(NumberVector vec) {
+ double[] difference = vec.getColumnVector().minusEquals(mean).getArrayRef();
+ double agg = 0.;
+ for(int i = 0; i < difference.length; i++) {
+ agg += difference[i] / variance * difference[i];
+ }
+ return agg;
+ }
+
+ @Override
+ public double estimateDensity(NumberVector vec) {
+ double power = mahalanobisDistance(vec) * .5;
+ double prob = normDistrFactor * Math.exp(-power);
+ if(!(prob >= 0.)) {
+ LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor);
+ prob = 0.;
+ }
+ return prob * weight;
+ }
+
+ @Override
+ public double getWeight() {
+ return weight;
+ }
+
+ @Override
+ public void setWeight(double weight) {
+ this.weight = weight;
+ }
+
+ @Override
+ public EMModel finalizeCluster() {
+ return new EMModel(mean, Matrix.identity(nmea.length, nmea.length).timesEquals(variance));
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java
new file mode 100644
index 00000000..7de77188
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/SphericalGaussianModelFactory.java
@@ -0,0 +1,88 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Factory for EM with multivariate gaussian models using diagonal matrixes.
+ *
+ * These models have individual variances, but no covariance, so this
+ * corresponds to the {@code 'VVI'} model in Mclust (R).
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has SphericalGaussianModel
+ *
+ * @param <V> vector type
+ */
+public class SphericalGaussianModelFactory<V extends NumberVector> extends AbstractEMModelFactory<V, EMModel> {
+ /**
+ * Constructor.
+ *
+ * @param initializer Class for choosing the inital seeds.
+ */
+ public SphericalGaussianModelFactory(KMeansInitialization<V> initializer) {
+ super(initializer);
+ }
+
+ @Override
+ public List<SphericalGaussianModel> buildInitialModels(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector> df) {
+ final List<Vector> initialMeans = initializer.chooseInitialMeans(database, relation, k, df, Vector.FACTORY);
+ assert (initialMeans.size() == k);
+ final int dimensionality = initialMeans.get(0).getDimensionality();
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality);
+ List<SphericalGaussianModel> models = new ArrayList<>(k);
+ for(Vector nv : initialMeans) {
+ models.add(new SphericalGaussianModel(1. / k, nv, norm));
+ }
+ return models;
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractEMModelFactory.Parameterizer<V> {
+ @Override
+ protected SphericalGaussianModelFactory<V> makeInstance() {
+ return new SphericalGaussianModelFactory<>(initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java
new file mode 100644
index 00000000..22f0b669
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/em/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Expectation-Maximization clustering algorithm.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.em; \ No newline at end of file