summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal
diff options
context:
space:
mode:
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal')
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java141
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateConcordantPairs.java3
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateDaviesBouldin.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSimplifiedSilhouette.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSquaredErrors.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateVarianceRatioCriteria.java1
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/NoiseHandling.java1
9 files changed, 108 insertions, 43 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
index e43d9cc5..a96dfc91 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
@@ -43,7 +44,9 @@ import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup;
import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.result.ResultHierarchy;
import de.lmu.ifi.dbs.elki.result.ResultUtil;
-import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -53,10 +56,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Compute the C-index of a data set.
+ *
+ * Note: This requires pairwise distance computations, so it is not recommended
+ * to use this on larger data sets.
*
* Reference:
* <p>
- * L. J. Hubert and J.R. Levin <br />
+ * L. J. Hubert and J. R. Levin <br />
* A general statistical framework for assessing categorical clustering in free
* recall<br />
* Psychological Bulletin, Vol. 83(6)
@@ -64,6 +70,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
@@ -115,15 +122,9 @@ public class EvaluateCIndex<O> implements Evaluator {
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
- // theta is the sum, w the number of within group distances
- double theta = 0;
- int w = 0;
- int ignorednoise = 0;
- int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1);
- DoubleArray pairDists = new DoubleArray(isize);
-
- for(int i = 0; i < clusters.size(); i++) {
- Cluster<?> cluster = clusters.get(i);
+ // Count ignored noise, and within-cluster distances
+ int ignorednoise = 0, w = 0;
+ for(Cluster<?> cluster : clusters) {
if(cluster.size() <= 1 || cluster.isNoise()) {
switch(noiseOption){
case IGNORE_NOISE:
@@ -135,41 +136,51 @@ public class EvaluateCIndex<O> implements Evaluator {
break; // Treat like a cluster
}
}
- for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
- O obj = rel.get(it1);
- // Compare object to every cluster, but only once
- for(int j = i; j < clusters.size(); j++) {
- Cluster<?> ocluster = clusters.get(j);
- if(ocluster.size() <= 1 || ocluster.isNoise()) {
- switch(noiseOption){
- case IGNORE_NOISE:
- continue; // Ignore this cluster.
- case TREAT_NOISE_AS_SINGLETONS:
- case MERGE_NOISE:
- break; // Treat like a cluster
- }
- }
- for(DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
- if(DBIDUtil.compare(it1, it2) <= 0) { // Only once.
- continue;
- }
- double dist = dq.distance(obj, rel.get(it2));
- pairDists.add(dist);
- if(ocluster == cluster) { // Within-cluster distances.
- theta += dist;
- w++;
- }
+ w += (cluster.size() * (cluster.size() - 1)) >>> 1;
+ }
+
+ double theta = 0.; // Sum of within-cluster distances
+ double min = 0, max = 0; // Sum of larges and smallest
+ if(w <= (rel.size() * (rel.size() - 1L)) >>> 2) {
+ DoubleHeap maxDists = new DoubleMinHeap(w); // Careful: REALLY minHeap!
+ DoubleHeap minDists = new DoubleMaxHeap(w); // Careful: REALLY maxHeap!
+
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
+ for(int i = 0; i < clusters.size(); i++) {
+ Cluster<?> cluster = clusters.get(i);
+ if(cluster.size() <= 1 || cluster.isNoise()) {
+ switch(noiseOption){
+ case IGNORE_NOISE:
+ LOG.incrementProcessed(prog);
+ continue; // Ignore
+ case TREAT_NOISE_AS_SINGLETONS:
+ processSingleton(cluster, rel, dq, maxDists, minDists, w);
+ LOG.incrementProcessed(prog);
+ continue;
+ case MERGE_NOISE:
+ break; // Treat like a cluster, below
}
}
+ theta += processCluster(cluster, clusters, i, dq, maxDists, minDists, w);
+ LOG.incrementProcessed(prog);
}
- }
+ LOG.ensureCompleted(prog);
- // Simulate best and worst cases:
- pairDists.sort();
- double min = 0, max = 0;
- for(int i = 0, j = pairDists.size() - 1; i < w; i++, j--) {
- min += pairDists.get(i);
- max += pairDists.get(j);
+ // Simulate best and worst cases:
+ assert (minDists.size() == w);
+ assert (maxDists.size() == w);
+ for(DoubleHeap.UnsortedIter it = minDists.unsortedIter(); it.valid(); it.advance()) {
+ min += it.get();
+ }
+ for(DoubleHeap.UnsortedIter it = maxDists.unsortedIter(); it.valid(); it.advance()) {
+ max += it.get();
+ }
+ assert (max >= min);
+ }
+ else {
+ // Since we have fewer cross-cluster distances than within-cluster
+ // distances, min=max and cIndex = 0.
+ theta = min = max = 0;
}
double cIndex = (max > min) ? (theta - min) / (max - min) : 0.;
@@ -189,6 +200,52 @@ public class EvaluateCIndex<O> implements Evaluator {
return cIndex;
}
+ protected double processCluster(Cluster<?> cluster, List<? extends Cluster<?>> clusters, int i, DistanceQuery<O> dq, DoubleHeap maxDists, DoubleHeap minDists, int w) {
+ double theta = 0.;
+ for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
+ // Compare object to every cluster, but only once
+ for(int j = i; j < clusters.size(); j++) {
+ Cluster<?> ocluster = clusters.get(j);
+ if(ocluster.size() <= 1 || ocluster.isNoise()) {
+ switch(noiseOption){
+ case IGNORE_NOISE:
+ continue; // Ignore this cluster.
+ case TREAT_NOISE_AS_SINGLETONS:
+ break; // Treat like a cluster
+ case MERGE_NOISE:
+ break; // Treat like a cluster
+ }
+ }
+ for(DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
+ if(DBIDUtil.compare(it1, it2) <= 0) { // Only once.
+ continue;
+ }
+ double dist = dq.distance(it1, it2);
+ minDists.add(dist, w);
+ maxDists.add(dist, w);
+ if(ocluster == cluster) { // Within-cluster distances.
+ theta += dist;
+ }
+ }
+ }
+ }
+ return theta;
+ }
+
+ protected void processSingleton(Cluster<?> cluster, Relation<? extends O> rel, DistanceQuery<O> dq, DoubleHeap maxDists, DoubleHeap minDists, int w) {
+ // All other objects are in other clusters!
+ for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
+ for(DBIDIter it2 = rel.iterDBIDs(); it2.valid(); it2.advance()) {
+ if(DBIDUtil.compare(it1, it2) <= 0) { // Only once.
+ continue;
+ }
+ double dist = dq.distance(it1, it2);
+ minDists.add(dist, w);
+ maxDists.add(dist, w);
+ }
+ }
+ }
+
@Override
public void processNewResult(ResultHierarchy hier, Result result) {
List<Clustering<?>> crs = ResultUtil.getClusteringResults(result);
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateConcordantPairs.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateConcordantPairs.java
index c5d2c199..119e9006 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateConcordantPairs.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateConcordantPairs.java
@@ -73,6 +73,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
@@ -312,7 +313,7 @@ public class EvaluateConcordantPairs<O> implements Evaluator {
/**
* Parameter for the option, how noise should be treated.
*/
- public static final OptionID NOISE_ID = new OptionID("davies-bouldin.noisehandling", "option, how noise should be treated.");
+ public static final OptionID NOISE_ID = new OptionID("concordant-pairs.noisehandling", "Control how noise should be treated.");
/**
* Distance function to use.
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateDaviesBouldin.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateDaviesBouldin.java
index ddeef150..d652a13a 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateDaviesBouldin.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateDaviesBouldin.java
@@ -63,6 +63,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* </p>
*
* @author Stephan Baier
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java
index 2c3a20e0..5a893887 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java
@@ -67,6 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java
index b40e6086..257ff1ea 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java
@@ -92,6 +92,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* TODO: keep all silhouette values, and allow visualization!
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSimplifiedSilhouette.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSimplifiedSilhouette.java
index a3f7e889..7ae8193c 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSimplifiedSilhouette.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSimplifiedSilhouette.java
@@ -60,6 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSquaredErrors.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSquaredErrors.java
index eec64fc4..0ea8cbf8 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSquaredErrors.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSquaredErrors.java
@@ -87,6 +87,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* by k-means.
*
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateVarianceRatioCriteria.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateVarianceRatioCriteria.java
index a48a410c..e8935593 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateVarianceRatioCriteria.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateVarianceRatioCriteria.java
@@ -65,6 +65,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*
* @apiviz.composedOf NoiseHandling
*/
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/NoiseHandling.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/NoiseHandling.java
index 7145ed94..759636d8 100644
--- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/NoiseHandling.java
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/NoiseHandling.java
@@ -28,6 +28,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.internal;
*
* @author Stephan Baier
* @author Erich Schubert
+ * @since 0.7.0
*/
public enum NoiseHandling {
/** Merge all noise into a cluster */