diff options
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java | 141 |
1 files changed, 99 insertions, 42 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java index e43d9cc5..a96dfc91 100644 --- a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateCIndex.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; @@ -43,7 +44,9 @@ import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.ResultHierarchy; import de.lmu.ifi.dbs.elki.result.ResultUtil; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; @@ -53,10 +56,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Compute the C-index of a data set. + * + * Note: This requires pairwise distance computations, so it is not recommended + * to use this on larger data sets. * * Reference: * <p> - * L. J. Hubert and J.R. Levin <br /> + * L. J. Hubert and J. R. Levin <br /> * A general statistical framework for assessing categorical clustering in free * recall<br /> * Psychological Bulletin, Vol. 83(6) @@ -64,6 +70,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @author Stephan Baier * @author Erich Schubert + * @since 0.7.0 * * @apiviz.composedOf NoiseHandling */ @@ -115,15 +122,9 @@ public class EvaluateCIndex<O> implements Evaluator { public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) { List<? extends Cluster<?>> clusters = c.getAllClusters(); - // theta is the sum, w the number of within group distances - double theta = 0; - int w = 0; - int ignorednoise = 0; - int isize = clusters.size() <= 1 ? rel.size() : rel.size() / (clusters.size() - 1); - DoubleArray pairDists = new DoubleArray(isize); - - for(int i = 0; i < clusters.size(); i++) { - Cluster<?> cluster = clusters.get(i); + // Count ignored noise, and within-cluster distances + int ignorednoise = 0, w = 0; + for(Cluster<?> cluster : clusters) { if(cluster.size() <= 1 || cluster.isNoise()) { switch(noiseOption){ case IGNORE_NOISE: @@ -135,41 +136,51 @@ public class EvaluateCIndex<O> implements Evaluator { break; // Treat like a cluster } } - for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) { - O obj = rel.get(it1); - // Compare object to every cluster, but only once - for(int j = i; j < clusters.size(); j++) { - Cluster<?> ocluster = clusters.get(j); - if(ocluster.size() <= 1 || ocluster.isNoise()) { - switch(noiseOption){ - case IGNORE_NOISE: - continue; // Ignore this cluster. - case TREAT_NOISE_AS_SINGLETONS: - case MERGE_NOISE: - break; // Treat like a cluster - } - } - for(DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) { - if(DBIDUtil.compare(it1, it2) <= 0) { // Only once. - continue; - } - double dist = dq.distance(obj, rel.get(it2)); - pairDists.add(dist); - if(ocluster == cluster) { // Within-cluster distances. - theta += dist; - w++; - } + w += (cluster.size() * (cluster.size() - 1)) >>> 1; + } + + double theta = 0.; // Sum of within-cluster distances + double min = 0, max = 0; // Sum of larges and smallest + if(w <= (rel.size() * (rel.size() - 1L)) >>> 2) { + DoubleHeap maxDists = new DoubleMinHeap(w); // Careful: REALLY minHeap! + DoubleHeap minDists = new DoubleMaxHeap(w); // Careful: REALLY maxHeap! + + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null; + for(int i = 0; i < clusters.size(); i++) { + Cluster<?> cluster = clusters.get(i); + if(cluster.size() <= 1 || cluster.isNoise()) { + switch(noiseOption){ + case IGNORE_NOISE: + LOG.incrementProcessed(prog); + continue; // Ignore + case TREAT_NOISE_AS_SINGLETONS: + processSingleton(cluster, rel, dq, maxDists, minDists, w); + LOG.incrementProcessed(prog); + continue; + case MERGE_NOISE: + break; // Treat like a cluster, below } } + theta += processCluster(cluster, clusters, i, dq, maxDists, minDists, w); + LOG.incrementProcessed(prog); } - } + LOG.ensureCompleted(prog); - // Simulate best and worst cases: - pairDists.sort(); - double min = 0, max = 0; - for(int i = 0, j = pairDists.size() - 1; i < w; i++, j--) { - min += pairDists.get(i); - max += pairDists.get(j); + // Simulate best and worst cases: + assert (minDists.size() == w); + assert (maxDists.size() == w); + for(DoubleHeap.UnsortedIter it = minDists.unsortedIter(); it.valid(); it.advance()) { + min += it.get(); + } + for(DoubleHeap.UnsortedIter it = maxDists.unsortedIter(); it.valid(); it.advance()) { + max += it.get(); + } + assert (max >= min); + } + else { + // Since we have fewer cross-cluster distances than within-cluster + // distances, min=max and cIndex = 0. + theta = min = max = 0; } double cIndex = (max > min) ? (theta - min) / (max - min) : 0.; @@ -189,6 +200,52 @@ public class EvaluateCIndex<O> implements Evaluator { return cIndex; } + protected double processCluster(Cluster<?> cluster, List<? extends Cluster<?>> clusters, int i, DistanceQuery<O> dq, DoubleHeap maxDists, DoubleHeap minDists, int w) { + double theta = 0.; + for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) { + // Compare object to every cluster, but only once + for(int j = i; j < clusters.size(); j++) { + Cluster<?> ocluster = clusters.get(j); + if(ocluster.size() <= 1 || ocluster.isNoise()) { + switch(noiseOption){ + case IGNORE_NOISE: + continue; // Ignore this cluster. + case TREAT_NOISE_AS_SINGLETONS: + break; // Treat like a cluster + case MERGE_NOISE: + break; // Treat like a cluster + } + } + for(DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) { + if(DBIDUtil.compare(it1, it2) <= 0) { // Only once. + continue; + } + double dist = dq.distance(it1, it2); + minDists.add(dist, w); + maxDists.add(dist, w); + if(ocluster == cluster) { // Within-cluster distances. + theta += dist; + } + } + } + } + return theta; + } + + protected void processSingleton(Cluster<?> cluster, Relation<? extends O> rel, DistanceQuery<O> dq, DoubleHeap maxDists, DoubleHeap minDists, int w) { + // All other objects are in other clusters! + for(DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) { + for(DBIDIter it2 = rel.iterDBIDs(); it2.valid(); it2.advance()) { + if(DBIDUtil.compare(it1, it2) <= 0) { // Only once. + continue; + } + double dist = dq.distance(it1, it2); + minDists.add(dist, w); + maxDists.add(dist, w); + } + } + } + @Override public void processNewResult(ResultHierarchy hier, Result result) { List<Clustering<?>> crs = ResultUtil.getClusteringResults(result); |