summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java191
1 files changed, 100 insertions, 91 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
index b3e6bb76..b5e9be6b 100644
--- a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -54,8 +54,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*/
-@Reference(authors = "D. Guo", title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", booktitle = "Information Visualization, 2(4)", url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
-public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+@Reference(authors = "D. Guo", //
+title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", //
+booktitle = "Information Visualization, 2(4)", //
+url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
+public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector> {
/**
* Static instance.
*/
@@ -77,11 +80,11 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
}
@Override
- public void computeDimensionSimilarites(Database database, Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ public void computeDimensionSimilarites(Database database, Relation<? extends NumberVector> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
final int dim = matrix.size();
// Find a number of bins as recommended by Cheng et al.
- double p = Math.log(subset.size() / (double) TARGET) / MathUtil.LOG2;
+ double p = MathUtil.log2(subset.size() / (double) TARGET);
// As we are in 2d, take the root (*.5) But let's use at least 1, too.
// Check: for 10000 this should give 4, for 150 it gives 1.
int power = Math.max(1, (int) Math.floor(p * .5));
@@ -92,18 +95,18 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
// Partition sizes
int[][] psizes = new int[dim][gridsize];
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
final ArrayList<DBIDs> partsd = parts.get(d);
final int[] sizesp = psizes[d];
- for (int i = 0; i < gridsize; i++) {
+ for(int i = 0; i < gridsize; i++) {
sizesp[i] = partsd.get(i).size();
}
}
int[][] res = new int[gridsize][gridsize];
- for (int x = 0; x < dim; x++) {
+ for(int x = 0; x < dim; x++) {
ArrayList<DBIDs> partsi = parts.get(x);
- for (int y = x + 1; y < dim; y++) {
+ for(int y = x + 1; y < dim; y++) {
ArrayList<DBIDs> partsj = parts.get(y);
// Fill the intersection matrix
intersectionMatrix(res, partsi, partsj, gridsize);
@@ -113,69 +116,6 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
}
/**
- * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
- *
- * @param res Output matrix to fill
- * @param partsx Partitions in first component
- * @param partsy Partitions in second component.
- * @param gridsize Size of partition decomposition
- */
- private void intersectionMatrix(int[][] res, ArrayList<? extends DBIDs> partsx, ArrayList<? extends DBIDs> partsy, int gridsize) {
- for (int x = 0; x < gridsize; x++) {
- final DBIDs px = partsx.get(x);
- final int[] rowx = res[x];
- for (int y = 0; y < gridsize; y++) {
- rowx[y] = DBIDUtil.intersectionSize(px, partsy.get(y));
- }
- }
- }
-
- /**
- * Compute the MCE entropy value.
- *
- * @param mat Partition size matrix
- * @param psizesx Partition sizes on X
- * @param psizesy Partition sizes on Y
- * @param size Data set size
- * @param gridsize Size of grids
- * @param loggrid Logarithm of grid sizes, for normalization
- * @return MCE score.
- */
- private double getMCEntropy(int[][] mat, int[] psizesx, int[] psizesy, int size, int gridsize, double loggrid) {
- // Margin entropies:
- double[] mx = new double[gridsize];
- double[] my = new double[gridsize];
-
- for (int i = 0; i < gridsize; i++) {
- // Note: indexes are a bit tricky here, because we compute both margin
- // entropies at the same time!
- final double sumx = (double) psizesx[i];
- final double sumy = (double) psizesy[i];
- for (int j = 0; j < gridsize; j++) {
- double px = mat[i][j] / sumx;
- double py = mat[j][i] / sumy;
-
- if (px > 0.) {
- mx[i] -= px * Math.log(px);
- }
- if (py > 0.) {
- my[i] -= py * Math.log(py);
- }
- }
- }
-
- // Weighted sums of margin entropies.
- double sumx = 0., sumy = 0.;
- for (int i = 0; i < gridsize; i++) {
- sumx += mx[i] * psizesx[i];
- sumy += my[i] * psizesy[i];
- }
-
- double max = ((sumx > sumy) ? sumx : sumy);
- return max / (size * loggrid);
- }
-
- /**
* Calculates "index structures" for every attribute, i.e. sorts a
* ModifiableArray of every DBID in the database for every dimension and
* stores them in a list.
@@ -185,14 +125,14 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
* @param matrix Matrix for dimension information
* @return List of sorted objects
*/
- private ArrayList<ArrayList<DBIDs>> buildPartitions(Relation<? extends NumberVector<?>> relation, DBIDs ids, int depth, DimensionSimilarityMatrix matrix) {
+ private ArrayList<ArrayList<DBIDs>> buildPartitions(Relation<? extends NumberVector> relation, DBIDs ids, int depth, DimensionSimilarityMatrix matrix) {
final int dim = matrix.size();
ArrayList<ArrayList<DBIDs>> subspaceIndex = new ArrayList<>(dim);
SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
double[] tmp = new double[ids.size()];
Mean mean = new Mean();
-
- for (int i = 0; i < dim; i++) {
+
+ for(int i = 0; i < dim; i++) {
final int d = matrix.dim(i);
// Index for a single dimension:
ArrayList<DBIDs> idx = new ArrayList<>(1 << depth);
@@ -202,7 +142,7 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
sids.sort(comp);
// Now we build the temp array, and compute the first mean.
DBIDArrayIter it = sids.iter();
- for (int j = 0; j < tmp.length; j++, it.advance()) {
+ for(int j = 0; j < tmp.length; j++, it.advance()) {
assert (it.valid());
tmp[j] = relation.get(it).doubleValue(d);
}
@@ -210,7 +150,7 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
assert (idx.size() == (1 << depth));
subspaceIndex.add(idx);
}
-
+
return subspaceIndex;
}
@@ -227,44 +167,50 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
*/
private void divide(DBIDArrayIter it, double[] data, ArrayList<DBIDs> idx, int start, int end, int depth, Mean mean) {
final int count = end - start;
- if (depth == 0) {
- if (count > 0) {
+ if(depth == 0) {
+ if(count > 0) {
ModifiableDBIDs out = DBIDUtil.newHashSet(count);
it.seek(start);
- for (int i = count; i > 0; i--, it.advance()) {
+ for(int i = count; i > 0; i--, it.advance()) {
out.add(it);
}
idx.add(out);
- } else {
+ }
+ else {
idx.add(DBIDUtil.EMPTYDBIDS);
}
return;
- } else {
- if (count > 0) {
+ }
+ else {
+ if(count > 0) {
mean.reset();
- for (int i = start; i < end; i++) {
+ for(int i = start; i < end; i++) {
mean.put(data[i]);
}
final double m = mean.getMean();
int pos = Arrays.binarySearch(data, start, end, m);
- if (pos >= 0) {
+ if(pos >= 0) {
// Ties: try to choose the most central element.
int opt = (start + end) >> 1;
- while (Double.compare(data[pos], m) == 0) {
- if (pos < opt) {
+ while(Double.compare(data[pos], m) == 0) {
+ if(pos < opt) {
pos++;
- } else if (pos > opt) {
+ }
+ else if(pos > opt) {
pos--;
- } else {
+ }
+ else {
break;
}
}
- } else {
+ }
+ else {
pos = (-pos - 1);
}
divide(it, data, idx, start, pos, depth - 1, mean);
divide(it, data, idx, pos, end, depth - 1, mean);
- } else {
+ }
+ else {
// Corner case, that should barely happen. But for ties, we currently
// Do not yet assure that it doesn't happen!
divide(it, data, idx, start, end, depth - 1, mean);
@@ -274,6 +220,69 @@ public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<
}
/**
+ * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
+ *
+ * @param res Output matrix to fill
+ * @param partsx Partitions in first component
+ * @param partsy Partitions in second component.
+ * @param gridsize Size of partition decomposition
+ */
+ private void intersectionMatrix(int[][] res, ArrayList<? extends DBIDs> partsx, ArrayList<? extends DBIDs> partsy, int gridsize) {
+ for(int x = 0; x < gridsize; x++) {
+ final DBIDs px = partsx.get(x);
+ final int[] rowx = res[x];
+ for(int y = 0; y < gridsize; y++) {
+ rowx[y] = DBIDUtil.intersectionSize(px, partsy.get(y));
+ }
+ }
+ }
+
+ /**
+ * Compute the MCE entropy value.
+ *
+ * @param mat Partition size matrix
+ * @param psizesx Partition sizes on X
+ * @param psizesy Partition sizes on Y
+ * @param size Data set size
+ * @param gridsize Size of grids
+ * @param loggrid Logarithm of grid sizes, for normalization
+ * @return MCE score.
+ */
+ private double getMCEntropy(int[][] mat, int[] psizesx, int[] psizesy, int size, int gridsize, double loggrid) {
+ // Margin entropies:
+ double[] mx = new double[gridsize];
+ double[] my = new double[gridsize];
+
+ for(int i = 0; i < gridsize; i++) {
+ // Note: indexes are a bit tricky here, because we compute both margin
+ // entropies at the same time!
+ final double sumx = (double) psizesx[i];
+ final double sumy = (double) psizesy[i];
+ for(int j = 0; j < gridsize; j++) {
+ double px = mat[i][j] / sumx;
+ double py = mat[j][i] / sumy;
+
+ if(px > 0.) {
+ mx[i] -= px * Math.log(px);
+ }
+ if(py > 0.) {
+ my[i] -= py * Math.log(py);
+ }
+ }
+ }
+
+ // Weighted sums of margin entropies.
+ double sumx = 0., sumy = 0.;
+ for(int i = 0; i < gridsize; i++) {
+ sumx += mx[i] * psizesx[i];
+ sumy += my[i] * psizesy[i];
+ }
+
+ double max = ((sumx > sumy) ? sumx : sumy);
+ return max / (size * loggrid);
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert