package de.lmu.ifi.dbs.elki.algorithm.clustering;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2011
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.EMModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;

/**
 * Provides the EM algorithm (clustering by expectation maximization).
 * <p />
 * Initialization is implemented as random initialization of means (uniformly
 * distributed within the attribute ranges of the given database) and initial
 * zero-covariance and variance=1 in covariance matrices.
 * <p />
 * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br />
 * Maximum Likelihood from Incomplete Data via the EM algorithm.<br />
 * In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31
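 * <p />
 * A minimal usage sketch (illustration only, not from the original
 * documentation; {@code database} and {@code relation} are assumed to have
 * been obtained through the usual ELKI input setup):
 *
 * <pre>
 * // Fit k=3 Gaussian models; delta=0.0 iterates until the expectation
 * // value no longer changes; a null seed uses a non-deterministic Random.
 * EM&lt;DoubleVector&gt; em = new EM&lt;DoubleVector&gt;(3, 0.0, null);
 * Clustering&lt;EMModel&lt;DoubleVector&gt;&gt; result = em.run(database, relation);
 * </pre>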
 *
 * @author Arthur Zimek
 *
 * @apiviz.has EMModel
 *
 * @param <V> a type of {@link NumberVector} as a suitable datatype for this
 *        algorithm
 */
@Title("EM-Clustering: Clustering by Expectation Maximization")
@Description("Provides k Gaussian mixtures maximizing the probability of the given data")
@Reference(authors = "A. P. Dempster, N. M. Laird, D. B. Rubin", title = "Maximum Likelihood from Incomplete Data via the EM algorithm", booktitle = "Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31", url = "http://www.jstor.org/stable/2984875")
public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<EMModel<V>>> implements ClusteringAlgorithm<Clustering<EMModel<V>>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(EM.class);

  /**
   * Small value to increment diagonally of a matrix in order to avoid
   * singularity before building the inverse.
   */
  private static final double SINGULARITY_CHEAT = 1E-9;

  /**
   * Minimum log likelihood, used to avoid -infinity when the density
   * degenerates to zero.
   */
  private static final double MIN_LOGLIKELIHOOD = -100000;

  /**
   * Parameter to specify the number of clusters to find, must be an integer
   * greater than 0.
   */
  public static final OptionID K_ID = OptionID.getOrCreateOptionID("em.k", "The number of clusters to find.");

  /**
   * Holds the value of {@link #K_ID}.
   */
  private int k;

  /**
   * Parameter to specify the termination criterion for maximization of E(M):
   * E(M) - E(M') &lt; em.delta, must be a double equal to or greater than 0.
   */
  public static final OptionID DELTA_ID = OptionID.getOrCreateOptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta");

  /**
   * Holds the value of {@link #DELTA_ID}.
   */
  private double delta;

  /**
   * Parameter to specify the random generator seed.
   */
  public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("em.seed", "The random number generator seed.");

  /**
   * Holds the value of {@link #SEED_ID}.
   */
  private Long seed;

  /**
   * Store the individual probabilities, for use by EMOutlierDetection etc.
   */
  private WritableDataStore<double[]> probClusterIGivenX;

  /**
   * Constructor.
   *
   * @param k k parameter
   * @param delta delta parameter
   * @param seed Seed parameter
   */
  public EM(int k, double delta, Long seed) {
    super();
    this.k = k;
    this.delta = delta;
    this.seed = seed;
  }

  /**
   * Performs the EM clustering algorithm on the given database.
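   * <p />
   * Starting from an initial expectation step, each iteration re-estimates
   * the cluster weights, means, and covariance matrices from the current
   * membership probabilities (maximization step) and then recomputes the
   * membership probabilities from the updated models (expectation step),
   * until the expectation value changes by less than {@link #delta}.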
   * <p />
   * Finally a hard clustering is provided where each point is assigned to the
   * cluster to which it belongs with the highest probability. The database
   * objects nevertheless keep the complete probability vector over all models
   * associated with them.
   *
   * @param database Database
   * @param relation Relation
   * @return Result
   */
  public Clustering<EMModel<V>> run(Database database, Relation<V> relation) {
    if(relation.size() == 0) {
      throw new IllegalArgumentException("database empty: must contain elements");
    }
    // initial models
    if(logger.isVerbose()) {
      logger.verbose("initializing " + k + " models");
    }
    List<V> means = initialMeans(relation);
    List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k);
    List<Double> normDistrFactor = new ArrayList<Double>(k);
    List<Matrix> invCovMatr = new ArrayList<Matrix>(k);
    List<Double> clusterWeights = new ArrayList<Double>(k);
    probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);

    int dimensionality = means.get(0).getDimensionality();
    for(int i = 0; i < k; i++) {
      Matrix m = Matrix.identity(dimensionality, dimensionality);
      covarianceMatrices.add(m);
      normDistrFactor.add(1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det()));
      invCovMatr.add(m.inverse());
      clusterWeights.add(1.0 / k);
      if(logger.isDebuggingFinest()) {
        StringBuffer msg = new StringBuffer();
        msg.append(" model ").append(i).append(":\n");
        msg.append(" mean: ").append(means.get(i)).append("\n");
        msg.append(" m:\n").append(FormatUtil.format(m, " ")).append("\n");
        msg.append(" m.det(): ").append(m.det()).append("\n");
        msg.append(" cluster weight: ").append(clusterWeights.get(i)).append("\n");
        msg.append(" normDistFact: ").append(normDistrFactor.get(i)).append("\n");
        logger.debugFine(msg.toString());
      }
    }
    double emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);

    // iteration unless no change
    if(logger.isVerbose()) {
      logger.verbose("iterating EM");
    }

    double em;
    int it = 0;
    do {
      it++;
      if(logger.isVerbose()) {
        logger.verbose("iteration " + it + " - expectation value: " + emNew);
      }
      em = emNew;

      // recompute models
      List<V> meanSums = new ArrayList<V>(k);
      double[] sumOfClusterProbabilities = new double[k];

      for(int i = 0; i < k; i++) {
        clusterWeights.set(i, 0.0);
        meanSums.add(means.get(i).nullVector());
        covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality));
      }

      // weights and means
      for(DBID id : relation.iterDBIDs()) {
        double[] clusterProbabilities = probClusterIGivenX.get(id);

        for(int i = 0; i < k; i++) {
          sumOfClusterProbabilities[i] += clusterProbabilities[i];
          V summand = relation.get(id).multiplicate(clusterProbabilities[i]);
          V currentMeanSum = meanSums.get(i).plus(summand);
          meanSums.set(i, currentMeanSum);
        }
      }
      final int n = relation.size();
      for(int i = 0; i < k; i++) {
        clusterWeights.set(i, sumOfClusterProbabilities[i] / n);
        V newMean = meanSums.get(i).multiplicate(1 / sumOfClusterProbabilities[i]);
        means.set(i, newMean);
      }
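      // M-step for the covariance matrices, as implemented below:
      // Sigma_i = sum_x P(c_i|x) * (x - mu_i)(x - mu_i)^T / sum_x P(c_i|x),
      // afterwards incremented on the diagonal by SINGULARITY_CHEAT so that
      // the matrix remains invertible.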
      // covariance matrices
      for(DBID id : relation.iterDBIDs()) {
        double[] clusterProbabilities = probClusterIGivenX.get(id);
        V instance = relation.get(id);
        for(int i = 0; i < k; i++) {
          V difference = instance.minus(means.get(i));
          covarianceMatrices.get(i).plusEquals(difference.getColumnVector().times(difference.getRowVector()).times(clusterProbabilities[i]));
        }
      }
      for(int i = 0; i < k; i++) {
        covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT));
      }
      for(int i = 0; i < k; i++) {
        normDistrFactor.set(i, 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * covarianceMatrices.get(i).det()));
        invCovMatr.set(i, covarianceMatrices.get(i).inverse());
      }
      // reassign probabilities
      emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
    }
    while(Math.abs(em - emNew) > delta);

    if(logger.isVerbose()) {
      logger.verbose("assigning clusters");
    }

    // fill result with clusters and models
    List<ModifiableDBIDs> hardClusters = new ArrayList<ModifiableDBIDs>(k);
    for(int i = 0; i < k; i++) {
      hardClusters.add(DBIDUtil.newHashSet());
    }

    // provide a hard clustering
    for(DBID id : relation.iterDBIDs()) {
      double[] clusterProbabilities = probClusterIGivenX.get(id);
      int maxIndex = 0;
      double currentMax = 0.0;
      for(int i = 0; i < k; i++) {
        if(clusterProbabilities[i] > currentMax) {
          maxIndex = i;
          currentMax = clusterProbabilities[i];
        }
      }
      hardClusters.get(maxIndex).add(id);
    }
    Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering");
    // provide models within the result
    for(int i = 0; i < k; i++) {
      // TODO: re-do labeling.
      // SimpleClassLabel label = new SimpleClassLabel();
      // label.init(result.canonicalClusterLabel(i));
      Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(means.get(i), covarianceMatrices.get(i)));
      result.addCluster(model);
    }
    return result;
  }

  /**
   * Assigns the current probability values to the instances in the database
   * and computes the expectation value of the current mixture of
   * distributions.
   * <p />
   * Computed as the sum of the logarithms of the prior probability of each
   * instance.
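   * <p />
   * For reference, the per-model density evaluated below is
   * normDistrFactor_i * exp(-(x - mu_i)^T * Sigma_i^{-1} * (x - mu_i) / 2),
   * where normDistrFactor_i = 1 / sqrt((2 * pi)^d * det(Sigma_i)).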
   *
   * @param database the database used for assignment to instances
   * @param normDistrFactor normalization factor for density function, based on
   *        current covariance matrix
   * @param means the current means
   * @param invCovMatr the inverse covariance matrices
   * @param clusterWeights the weights of the current clusters
   * @param probClusterIGivenX the data store the membership probabilities are
   *        written to
   * @return the expectation value of the current mixture of distributions
   */
  protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<V> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
    double emSum = 0.0;

    for(DBID id : database.iterDBIDs()) {
      V x = database.get(id);
      List<Double> probabilities = new ArrayList<Double>(k);
      for(int i = 0; i < k; i++) {
        V difference = x.minus(means.get(i));
        Matrix differenceRow = difference.getRowVector();
        Vector differenceCol = difference.getColumnVector();
        Matrix rowTimesCov = differenceRow.times(invCovMatr.get(i));
        Vector rowTimesCovTimesCol = rowTimesCov.times(differenceCol);
        double power = rowTimesCovTimesCol.get(0, 0) / 2.0;
        double prob = normDistrFactor.get(i) * Math.exp(-power);
        if(logger.isDebuggingFinest()) {
          logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" //
              + " differenceRow:\n" + FormatUtil.format(differenceRow, " ") + "\n" //
              + " differenceCol:\n" + FormatUtil.format(differenceCol, " ") + "\n" //
              + " rowTimesCov:\n" + FormatUtil.format(rowTimesCov, " ") + "\n" //
              + " rowTimesCovTimesCol:\n" + FormatUtil.format(rowTimesCovTimesCol, " ") + "\n" //
              + " power= " + power + "\n" + " prob=" + prob + "\n" //
              + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " "));
        }
        probabilities.add(prob);
      }
      double priorProbability = 0.0;
      for(int i = 0; i < k; i++) {
        priorProbability += probabilities.get(i) * clusterWeights.get(i);
      }
      double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD);
      if(!Double.isNaN(logP)) {
        emSum += logP;
      }

      double[] clusterProbabilities = new double[k];
      for(int i = 0; i < k; i++) {
        assert (priorProbability >= 0.0);
        assert (clusterWeights.get(i) >= 0.0);
        // do not divide by zero!
        if(priorProbability == 0.0) {
          clusterProbabilities[i] = 0.0;
        }
        else {
          clusterProbabilities[i] = probabilities.get(i) / priorProbability * clusterWeights.get(i);
        }
      }
      probClusterIGivenX.put(id, clusterProbabilities);
    }

    return emSum;
  }

  /**
   * Creates {@link #k k} random points distributed uniformly within the
   * attribute ranges of the given database.
   *
   * @param relation the database must contain enough points in order to
   *        ascertain the range of attribute values. Less than two points would
   *        make no sense. The content of the database is not touched
   *        otherwise.
   * @return a list of {@link #k k} random points distributed uniformly within
   *         the attribute ranges of the given database
   */
  protected List<V> initialMeans(Relation<V> relation) {
    final Random random;
    if(this.seed != null) {
      random = new Random(this.seed);
    }
    else {
      random = new Random();
    }
    if(relation.size() > 0) {
      final int dim = DatabaseUtil.dimensionality(relation);
      Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
      List<V> means = new ArrayList<V>(k);
      if(logger.isVerbose()) {
        logger.verbose("initializing random vectors");
      }
      for(int i = 0; i < k; i++) {
        double[] r = MathUtil.randomDoubleArray(dim, random);
        // Rescale the uniform [0:1) values to the attribute range:
        // r[d] = min[d] + (max[d] - min[d]) * r[d]
        for(int d = 0; d < dim; d++) {
          r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
        }
        // Instantiate
        V randomVector = minmax.first.newInstance(r);
        means.add(randomVector);
      }
      return means;
    }
    else {
      return new ArrayList<V>(0);
    }
  }

  /**
   * Get the probabilities for a given point.
   *
   * @param index Point ID
   * @return Probabilities of given point
   */
  public double[] getProbClusterIGivenX(DBID index) {
    return probClusterIGivenX.get(index);
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return logger;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
    protected int k;

    protected double delta;

    protected Long seed;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
      if(config.grab(kP)) {
        k = kP.getValue();
      }

      DoubleParameter deltaP = new DoubleParameter(DELTA_ID, new GreaterEqualConstraint(0.0), 0.0);
      if(config.grab(deltaP)) {
        delta = deltaP.getValue();
      }

      LongParameter seedP = new LongParameter(SEED_ID, true);
      if(config.grab(seedP)) {
        seed = seedP.getValue();
      }
    }

    @Override
    protected EM<V> makeInstance() {
      return new EM<V>(k, delta, seed);
    }
  }
}