1 files changed, 100 insertions, 91 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
index b3e6bb76..b5e9be6b 100644
--- a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
  This file is part of ELKI:
  Environment for Developing KDD-Applications Supported by Index-Structures
 
- Copyright (C) 2012
+ Copyright (C) 2014
  Ludwig-Maximilians-Universität München
  Lehr- und Forschungseinheit für Datenbanksysteme
  ELKI Development Team
@@ -54,8 +54,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
  * 
  * @author Erich Schubert
  */
-@Reference(authors = "D. Guo", title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", booktitle = "Information Visualization, 2(4)", url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
-public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+@Reference(authors = "D. Guo", //
+title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", //
+booktitle = "Information Visualization, 2(4)", //
+url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
+public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector> {
   /**
    * Static instance.
    */
@@ -77,11 +80,11 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
   }
 
   @Override
-  public void computeDimensionSimilarites(Database database, Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+  public void computeDimensionSimilarites(Database database, Relation<? extends NumberVector> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
     final int dim = matrix.size();
 
     // Find a number of bins as recommended by Cheng et al.
-    double p = Math.log(subset.size() / (double) TARGET) / MathUtil.LOG2;
+    double p = MathUtil.log2(subset.size() / (double) TARGET);
     // As we are in 2d, take the root (*.5) But let's use at least 1, too.
     // Check: for 10000 this should give 4, for 150 it gives 1.
     int power = Math.max(1, (int) Math.floor(p * .5));
@@ -92,18 +95,18 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
 
     // Partition sizes
     int[][] psizes = new int[dim][gridsize];
-    for (int d = 0; d < dim; d++) {
+    for(int d = 0; d < dim; d++) {
       final ArrayList<DBIDs> partsd = parts.get(d);
       final int[] sizesp = psizes[d];
-      for (int i = 0; i < gridsize; i++) {
+      for(int i = 0; i < gridsize; i++) {
         sizesp[i] = partsd.get(i).size();
       }
     }
 
     int[][] res = new int[gridsize][gridsize];
-    for (int x = 0; x < dim; x++) {
+    for(int x = 0; x < dim; x++) {
       ArrayList<DBIDs> partsi = parts.get(x);
-      for (int y = x + 1; y < dim; y++) {
+      for(int y = x + 1; y < dim; y++) {
         ArrayList<DBIDs> partsj = parts.get(y);
         // Fill the intersection matrix
         intersectionMatrix(res, partsi, partsj, gridsize);
@@ -113,69 +116,6 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
   }
 
   /**
-   * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
-   * 
-   * @param res Output matrix to fill
-   * @param partsx Partitions in first component
-   * @param partsy Partitions in second component.
-   * @param gridsize Size of partition decomposition
-   */
-  private void intersectionMatrix(int[][] res, ArrayList<? extends DBIDs> partsx, ArrayList<? extends DBIDs> partsy, int gridsize) {
-    for (int x = 0; x < gridsize; x++) {
-      final DBIDs px = partsx.get(x);
-      final int[] rowx = res[x];
-      for (int y = 0; y < gridsize; y++) {
-        rowx[y] = DBIDUtil.intersectionSize(px, partsy.get(y));
-      }
-    }
-  }
-
-  /**
-   * Compute the MCE entropy value.
-   * 
-   * @param mat Partition size matrix
-   * @param psizesx Partition sizes on X
-   * @param psizesy Partition sizes on Y
-   * @param size Data set size
-   * @param gridsize Size of grids
-   * @param loggrid Logarithm of grid sizes, for normalization
-   * @return MCE score.
-   */
-  private double getMCEntropy(int[][] mat, int[] psizesx, int[] psizesy, int size, int gridsize, double loggrid) {
-    // Margin entropies:
-    double[] mx = new double[gridsize];
-    double[] my = new double[gridsize];
-
-    for (int i = 0; i < gridsize; i++) {
-      // Note: indexes are a bit tricky here, because we compute both margin
-      // entropies at the same time!
-      final double sumx = (double) psizesx[i];
-      final double sumy = (double) psizesy[i];
-      for (int j = 0; j < gridsize; j++) {
-        double px = mat[i][j] / sumx;
-        double py = mat[j][i] / sumy;
-
-        if (px > 0.) {
-          mx[i] -= px * Math.log(px);
-        }
-        if (py > 0.) {
-          my[i] -= py * Math.log(py);
-        }
-      }
-    }
-
-    // Weighted sums of margin entropies.
-    double sumx = 0., sumy = 0.;
-    for (int i = 0; i < gridsize; i++) {
-      sumx += mx[i] * psizesx[i];
-      sumy += my[i] * psizesy[i];
-    }
-
-    double max = ((sumx > sumy) ? sumx : sumy);
-    return max / (size * loggrid);
-  }
-
-  /**
    * Calculates "index structures" for every attribute, i.e. sorts a
    * ModifiableArray of every DBID in the database for every dimension and
    * stores them in a list.
@@ -185,14 +125,14 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
    * @param matrix Matrix for dimension information
    * @return List of sorted objects
    */
-  private ArrayList<ArrayList<DBIDs>> buildPartitions(Relation<? extends NumberVector<?>> relation, DBIDs ids, int depth, DimensionSimilarityMatrix matrix) {
+  private ArrayList<ArrayList<DBIDs>> buildPartitions(Relation<? extends NumberVector> relation, DBIDs ids, int depth, DimensionSimilarityMatrix matrix) {
     final int dim = matrix.size();
     ArrayList<ArrayList<DBIDs>> subspaceIndex = new ArrayList<>(dim);
     SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
     double[] tmp = new double[ids.size()];
     Mean mean = new Mean();
-
-    for (int i = 0; i < dim; i++) {
+  
+    for(int i = 0; i < dim; i++) {
       final int d = matrix.dim(i);
       // Index for a single dimension:
       ArrayList<DBIDs> idx = new ArrayList<>(1 << depth);
@@ -202,7 +142,7 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
       sids.sort(comp);
       // Now we build the temp array, and compute the first mean.
       DBIDArrayIter it = sids.iter();
-      for (int j = 0; j < tmp.length; j++, it.advance()) {
+      for(int j = 0; j < tmp.length; j++, it.advance()) {
         assert (it.valid());
         tmp[j] = relation.get(it).doubleValue(d);
       }
@@ -210,7 +150,7 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
       assert (idx.size() == (1 << depth));
       subspaceIndex.add(idx);
     }
-
+  
     return subspaceIndex;
   }
 
@@ -227,44 +167,50 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
    */
   private void divide(DBIDArrayIter it, double[] data, ArrayList<DBIDs> idx, int start, int end, int depth, Mean mean) {
     final int count = end - start;
-    if (depth == 0) {
-      if (count > 0) {
+    if(depth == 0) {
+      if(count > 0) {
         ModifiableDBIDs out = DBIDUtil.newHashSet(count);
         it.seek(start);
-        for (int i = count; i > 0; i--, it.advance()) {
+        for(int i = count; i > 0; i--, it.advance()) {
           out.add(it);
         }
         idx.add(out);
-      } else {
+      }
+      else {
         idx.add(DBIDUtil.EMPTYDBIDS);
       }
       return;
-    } else {
-      if (count > 0) {
+    }
+    else {
+      if(count > 0) {
         mean.reset();
-        for (int i = start; i < end; i++) {
+        for(int i = start; i < end; i++) {
           mean.put(data[i]);
         }
         final double m = mean.getMean();
         int pos = Arrays.binarySearch(data, start, end, m);
-        if (pos >= 0) {
+        if(pos >= 0) {
           // Ties: try to choose the most central element.
           int opt = (start + end) >> 1;
-          while (Double.compare(data[pos], m) == 0) {
-            if (pos < opt) {
+          while(Double.compare(data[pos], m) == 0) {
+            if(pos < opt) {
               pos++;
-            } else if (pos > opt) {
+            }
+            else if(pos > opt) {
               pos--;
-            } else {
+            }
+            else {
               break;
             }
           }
-        } else {
+        }
+        else {
           pos = (-pos - 1);
         }
         divide(it, data, idx, start, pos, depth - 1, mean);
         divide(it, data, idx, pos, end, depth - 1, mean);
-      } else {
+      }
+      else {
         // Corner case, that should barely happen. But for ties, we currently
         // Do not yet assure that it doesn't happen!
         divide(it, data, idx, start, end, depth - 1, mean);
@@ -274,6 +220,69 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
   }
 
   /**
+   * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
+   * 
+   * @param res Output matrix to fill
+   * @param partsx Partitions in first component
+   * @param partsy Partitions in second component.
+   * @param gridsize Size of partition decomposition
+   */
+  private void intersectionMatrix(int[][] res, ArrayList<? extends DBIDs> partsx, ArrayList<? extends DBIDs> partsy, int gridsize) {
+    for(int x = 0; x < gridsize; x++) {
+      final DBIDs px = partsx.get(x);
+      final int[] rowx = res[x];
+      for(int y = 0; y < gridsize; y++) {
+        rowx[y] = DBIDUtil.intersectionSize(px, partsy.get(y));
+      }
+    }
+  }
+
+  /**
+   * Compute the MCE entropy value.
+   * 
+   * @param mat Partition size matrix
+   * @param psizesx Partition sizes on X
+   * @param psizesy Partition sizes on Y
+   * @param size Data set size
+   * @param gridsize Size of grids
+   * @param loggrid Logarithm of grid sizes, for normalization
+   * @return MCE score.
+   */
+  private double getMCEntropy(int[][] mat, int[] psizesx, int[] psizesy, int size, int gridsize, double loggrid) {
+    // Margin entropies:
+    double[] mx = new double[gridsize];
+    double[] my = new double[gridsize];
+
+    for(int i = 0; i < gridsize; i++) {
+      // Note: indexes are a bit tricky here, because we compute both margin
+      // entropies at the same time!
+      final double sumx = (double) psizesx[i];
+      final double sumy = (double) psizesy[i];
+      for(int j = 0; j < gridsize; j++) {
+        double px = mat[i][j] / sumx;
+        double py = mat[j][i] / sumy;
+
+        if(px > 0.) {
+          mx[i] -= px * Math.log(px);
+        }
+        if(py > 0.) {
+          my[i] -= py * Math.log(py);
+        }
+      }
+    }
+
+    // Weighted sums of margin entropies.
+    double sumx = 0., sumy = 0.;
+    for(int i = 0; i < gridsize; i++) {
+      sumx += mx[i] * psizesx[i];
+      sumy += my[i] * psizesy[i];
+    }
+
+    double max = ((sumx > sumy) ? sumx : sumy);
+    return max / (size * loggrid);
+  }
+
+  /**
    * Parameterization class.
    * 
    * @author Erich Schubert