summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
diff options
context:
space:
mode:
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java')
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java141
1 files changed, 99 insertions, 42 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
index e43d9cc5..a96dfc91 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
@@ -43,7 +44,9 @@ import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup;
import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.result.ResultHierarchy;
import de.lmu.ifi.dbs.elki.result.ResultUtil;
-import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -53,10 +56,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Compute the C-index of a data set.
+ *
+ * Note: This requires pairwise distance computations, so it is not recommended
+ * to use this on larger data sets.
*
* Reference:
* <p>
- * L. J. Hubert and J.R. Levin <br />
+ * L. J. Hubert and J. R. Levin <br />
* A general statistical framework for assessing categorical clustering in free
* recall<br />
* Psychological Bulletin, Vol. 83(6)
@@ -64,6 +70,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
@@ -115,15 +122,9 @@ public class EvaluateCIndex<O> implements Evaluator {
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
- // theta is the sum, w the number of within group distances
- double theta = 0;
- int w = 0;
- int ignorednoise = 0;
- int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1);
- DoubleArray pairDists = new DoubleArray(isize);
-
- for(int i = 0; i < clusters.size(); i++) {
- Cluster<?> cluster = clusters.get(i);
+ // Count ignored noise, and within-cluster distances
+ int ignorednoise = 0, w = 0;
+ for(Cluster<?> cluster : clusters) {
if(cluster.size() <= 1 || cluster.isNoise()) {
switch(noiseOption){
case IGNORE_NOISE:
@@ -135,41 +136,51 @@ public class EvaluateCIndex<O> implements Evaluator {
break; // Treat like a cluster
}
}
- for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
- O obj = rel.get(it1);
- // Compare object to every cluster, but only once
- for(int j = i; j < clusters.size(); j++) {
- Cluster<?> ocluster = clusters.get(j);
- if(ocluster.size() <= 1 || ocluster.isNoise()) {
- switch(noiseOption){
- case IGNORE_NOISE:
- continue; // Ignore this cluster.
- case TREAT_NOISE_AS_SINGLETONS:
- case MERGE_NOISE:
- break; // Treat like a cluster
- }
- }
- for(DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
- if(DBIDUtil.compare(it1, it2) <= 0) { // Only once.
- continue;
- }
- double dist = dq.distance(obj, rel.get(it2));
- pairDists.add(dist);
- if(ocluster == cluster) { // Within-cluster distances.
- theta += dist;
- w++;
- }
+ w += (cluster.size() * (cluster.size() - 1)) >>> 1;
+ }
+
+ double theta = 0.; // Sum of within-cluster distances
+ double min = 0, max = 0; // Sum of larges and smallest
+ if(w <= (rel.size() * (rel.size() - 1L)) >>> 2) {
+ DoubleHeap maxDists = new DoubleMinHeap(w); // Careful: REALLY minHeap!
+ DoubleHeap minDists = new DoubleMaxHeap(w); // Careful: REALLY maxHeap!
+
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
+ for(int i = 0; i < clusters.size(); i++) {
+ Cluster<?> cluster = clusters.get(i);
+ if(cluster.size() <= 1 || cluster.isNoise()) {
+ switch(noiseOption){
+ case IGNORE_NOISE:
+ LOG.incrementProcessed(prog);
+ continue; // Ignore
+ case TREAT_NOISE_AS_SINGLETONS:
+ processSingleton(cluster, rel, dq, maxDists, minDists, w);
+ LOG.incrementProcessed(prog);
+ continue;
+ case MERGE_NOISE:
+ break; // Treat like a cluster, below
}
}
+ theta += processCluster(cluster, clusters, i, dq, maxDists, minDists, w);
+ LOG.incrementProcessed(prog);
}
- }
+ LOG.ensureCompleted(prog);
- // Simulate best and worst cases:
- pairDists.sort();
- double min = 0, max = 0;
- for(int i = 0, j = pairDists.size() - 1; i < w; i++, j--) {
- min += pairDists.get(i);
- max += pairDists.get(j);
+ // Simulate best and worst cases:
+ assert (minDists.size() == w);
+ assert (maxDists.size() == w);
+ for(DoubleHeap.UnsortedIter it = minDists.unsortedIter(); it.valid(); it.advance()) {
+ min += it.get();
+ }
+ for(DoubleHeap.UnsortedIter it = maxDists.unsortedIter(); it.valid(); it.advance()) {
+ max += it.get();
+ }
+ assert (max >= min);
+ }
+ else {
+ // Since we have fewer cross-cluster distances than within-cluster
+ // distances, min=max and cIndex = 0.
+ theta = min = max = 0;
}
double cIndex = (max > min) ? (theta - min) / (max - min) : 0.;
@@ -189,6 +200,52 @@ public class EvaluateCIndex<O> implements Evaluator {
return cIndex;
}
+ protected double processCluster(Cluster<?> cluster, List<? extends Cluster<?>> clusters, int i, DistanceQuery<O> dq, DoubleHeap maxDists, DoubleHeap minDists, int w) {
+ double theta = 0.;
+ for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
+ // Compare object to every cluster, but only once
+ for(int j = i; j < clusters.size(); j++) {
+ Cluster<?> ocluster = clusters.get(j);
+ if(ocluster.size() <= 1 || ocluster.isNoise()) {
+ switch(noiseOption){
+ case IGNORE_NOISE:
+ continue; // Ignore this cluster.
+ case TREAT_NOISE_AS_SINGLETONS:
+ break; // Treat like a cluster
+ case MERGE_NOISE:
+ break; // Treat like a cluster
+ }
+ }
+ for(DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
+ if(DBIDUtil.compare(it1, it2) <= 0) { // Only once.
+ continue;
+ }
+ double dist = dq.distance(it1, it2);
+ minDists.add(dist, w);
+ maxDists.add(dist, w);
+ if(ocluster == cluster) { // Within-cluster distances.
+ theta += dist;
+ }
+ }
+ }
+ }
+ return theta;
+ }
+
+ protected void processSingleton(Cluster<?> cluster, Relation<? extends O> rel, DistanceQuery<O> dq, DoubleHeap maxDists, DoubleHeap minDists, int w) {
+ // All other objects are in other clusters!
+ for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
+ for(DBIDIter it2 = rel.iterDBIDs(); it2.valid(); it2.advance()) {
+ if(DBIDUtil.compare(it1, it2) <= 0) { // Only once.
+ continue;
+ }
+ double dist = dq.distance(it1, it2);
+ minDists.add(dist, w);
+ maxDists.add(dist, w);
+ }
+ }
+ }
+
@Override
public void processNewResult(ResultHierarchy hier, Result result) {
List<Clustering<?>> crs = ResultUtil.getClusteringResults(result);