Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java')
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java | 135
1 file changed, 65 insertions(+), 70 deletions(-)
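
The substance of this commit is a refactoring: the parallel means / covarianceMatrices / clusterWeights arrays are replaced by EMClusterModel objects (MultivariateGaussianModel) from the new em package. As an illustrative sketch only — assuming nothing beyond the constructor and EM helper signatures visible in the hunks below, with the imports shown there and the convergence test elided — the resulting EM loop reads roughly as follows:

  // Build one Gaussian model per cluster core (weight, zero mean, normalization factor).
  List<MultivariateGaussianModel> models = new ArrayList<>(k);
  final double f = MathUtil.powi(MathUtil.TWOPI, dim);
  for(int i = 0; i < k; i++) {
    models.add(new MultivariateGaussianModel(clusterWeights[i], new Vector(dim), f));
  }
  // E step: assign membership probabilities; M step: update means and covariances inside each model.
  double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
    final double emOld = emNew;
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
    emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    // Convergence check against the configured threshold omitted in this sketch.
  }
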
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
index 9d1ee94d..4e1de7c7 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,12 +25,13 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EM;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EMClusterModel;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -61,7 +62,6 @@ import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
@@ -103,7 +103,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*/
@Title("P3C: A Robust Projected Clustering Algorithm.")
@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
-public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+public class P3C<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -156,7 +156,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
/**
* Performs the P3C algorithm on the given Database.
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
final int dim = RelationUtil.dimensionality(relation);
// Overall progress.
@@ -167,7 +167,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
// Desired number of bins, as per Sturge:
- final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
+ final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
// Perform 1-dimensional projections, and split into bins.
SetDBIDs[][] partitions = partitionData(relation, binCount);
@@ -178,7 +178,6 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// Set markers for each attribute until they're all deemed uniform.
final long[][] markers = new long[dim][];
- int numuniform = 0;
for(int d = 0; d < dim; d++) {
final SetDBIDs[] parts = partitions[d];
if(parts == null) {
@@ -191,7 +190,6 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// previously marked.
int bestBin = chiSquaredUniformTest(parts, marked, card);
if(bestBin < 0) {
- numuniform++;
break; // Uniform
}
BitsUtil.setI(marked, bestBin);
@@ -224,9 +222,9 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
if(clusterCores.size() == 0) {
- stepProgress.setCompleted(LOG);
- Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
- c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+ LOG.setCompleted(stepProgress);
+ Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
+ c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
return c;
}
@@ -238,26 +236,19 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
ModifiableDBIDs noise = DBIDUtil.newHashSet();
WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
int k = clusterCores.size();
- double[] clusterWeights = new double[k];
- computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+ List<MultivariateGaussianModel> models = new ArrayList<>(k);
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
// Initial estimate of covariances, to assign noise objects
- Vector[] means = new Vector[k];
- Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
- final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
- double[] normDistrFactor = new double[k];
- Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
- EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
- EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
- assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
-
- double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
+ assignUnassigned(relation, probClusterIGivenX, models, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
final double emOld = emNew;
- EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
- EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
// reassign probabilities
- emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
if(LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
@@ -283,7 +274,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// Outlier detection. Remove points from clusters that have a Mahalanobis
// distance larger than the critical value of the ChiSquare distribution.
- findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+ findOutliers(relation, models, clusterCandidates, noise);
if(stepProgress != null) {
stepProgress.beginStep(8, "Removing empty clusters.", LOG);
@@ -312,20 +303,18 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
// Generate final output.
- Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
ClusterCandidate candidate = clusterCandidates.get(cluster);
CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
- result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
}
LOG.verbose("Noise size: " + noise.size());
if(noise.size() > 0) {
- result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
}
- if(stepProgress != null) {
- stepProgress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(stepProgress);
return result;
}
@@ -500,7 +489,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param parts Parts array
* @param start Array start index
* @param end Array end index (exclusive)
- * @return
+ * @return Union
*/
protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
int sum = 0;
@@ -561,12 +550,15 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param clusterCores the cluster cores.
* @param unassigned set to which to add unassigned points.
* @param probClusterIGivenX Membership probabilities.
- * @param clusterWeights Cluster weights
+ * @param models Cluster models.
+ * @param dim Dimensionality
*/
- private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+ private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, int dim) {
final int n = relation.size();
+ final double pweight = 1. / n; // Weight of each point
final int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
int count = 0;
double[] weights = new double[k];
@@ -581,7 +573,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
if(count > 0) {
// Rescale.
VMath.timesEquals(weights, 1. / count);
- VMath.plusTimesEquals(clusterWeights, weights, 1. / n);
+ VMath.plusTimesEquals(clusterWeights, weights, pweight);
}
else {
// Does not match any cluster, mark it.
@@ -589,6 +581,10 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
probClusterIGivenX.put(iter, weights);
}
+ final double f = MathUtil.powi(MathUtil.TWOPI, dim); // Scaling coefficient
+ for(int i = 0; i < k; i++) {
+ models.add(new MultivariateGaussianModel(clusterWeights[i], new Vector(dim), f));
+ }
}
/**
@@ -597,36 +593,44 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
*
* @param relation Data relation
* @param probClusterIGivenX fuzzy membership matrix.
- * @param means Cluster means.
- * @param invCovMatr Cluster covariance matrices.
- * @param clusterWeights
- * @param assigned mapping of matrix row to DBID.
+ * @param models Cluster models.
* @param unassigned the list of points not yet assigned.
*/
- private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+ private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, ModifiableDBIDs unassigned) {
if(unassigned.size() == 0) {
return;
}
- final int k = means.length;
+ final int k = models.size();
double pweight = 1. / relation.size();
+ // Rescale weights, to take unassigned points into account:
+ for(EMClusterModel<?> m : models) {
+ m.setWeight(m.getWeight() * (relation.size() - unassigned.size()) * pweight);
+ }
+
+ // Assign noise objects, increase weights accordingly.
for(DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
// Find the best matching known cluster core using the Mahalanobis
// distance.
- Vector v = relation.get(iter).getColumnVector();
+ V v = relation.get(iter);
int bestCluster = -1;
+ MultivariateGaussianModel bestModel = null;
double minDistance = Double.POSITIVE_INFINITY;
- for(int c = 0; c < k; ++c) {
- final double distance = MathUtil.mahalanobisDistance(invCovMatr[c], v.minus(means[c]));
+ int c = 0;
+ for(MultivariateGaussianModel model : models) {
+ final double distance = model.mahalanobisDistance(v);
if(distance < minDistance) {
minDistance = distance;
bestCluster = c;
+ bestModel = model;
}
+ c++;
}
// Assign to best core.
double[] weights = new double[k];
- weights[bestCluster] = 1.0;
- clusterWeights[bestCluster] += pweight;
+ weights[bestCluster] = 1.;
+
+ bestModel.setWeight(bestModel.getWeight() + pweight);
probClusterIGivenX.put(iter, weights);
}
@@ -675,28 +679,19 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* attributes.
*
* @param relation Data relation
- * @param means Cluster means
- * @param invCovMatr Inverse covariance matrixes
+ * @param models Cluster models
* @param clusterCandidates the list of clusters to check.
- * @param nonUniformDimensionCount the number of dimensions to consider when
- * testing.
* @param noise the set to which to add points deemed outliers.
*/
- private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
- final int k = clusterCandidates.size();
-
- for(int c = 0; c < k; ++c) {
+ private void findOutliers(Relation<V> relation, List<MultivariateGaussianModel> models, ArrayList<ClusterCandidate> clusterCandidates, ModifiableDBIDs noise) {
+ Iterator<MultivariateGaussianModel> it = models.iterator();
+ for(int c = 0; it.hasNext(); c++) {
+ MultivariateGaussianModel model = it.next();
final ClusterCandidate candidate = clusterCandidates.get(c);
- if(candidate.ids.size() < 2) {
- continue;
- }
- final int dof = candidate.dimensions.cardinality();
- final double threshold = ChiSquaredDistribution.quantile(1 - .001, dof);
+ final int dof = BitsUtil.cardinality(candidate.dimensions);
+ final double threshold = ChiSquaredDistribution.quantile(1 - alpha, dof);
for(DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
- final Vector mean = means[c];
- final Vector delta = relation.get(iter).getColumnVector().minusEquals(mean);
- final Matrix invCov = invCovMatr[c];
- final double distance = MathUtil.mahalanobisDistance(invCov, delta);
+ final double distance = model.mahalanobisDistance(relation.get(iter));
if(distance >= threshold) {
// Outlier, remove it and add it to the outlier set.
noise.add(iter);
@@ -851,7 +846,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
/**
* Selected dimensions
*/
- public final BitSet dimensions;
+ public final long[] dimensions;
/**
* Objects contained in cluster.
@@ -864,9 +859,9 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param clusterCore Signature
*/
public ClusterCandidate(Signature clusterCore) {
- this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+ this.dimensions = BitsUtil.zero(clusterCore.spec.length >> 1);
for(int i = 0; i < clusterCore.spec.length; i += 2) {
- this.dimensions.set(i >> 1);
+ BitsUtil.setI(this.dimensions, i >> 1);
}
this.ids = DBIDUtil.newArray(clusterCore.ids.size());
}
@@ -889,7 +884,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter for the chi squared test threshold.
*/