diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation/clustering')
14 files changed, 480 insertions, 165 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java index 3353b593..1c84f0e1 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java index c3f8129f..d5dec735 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,16 +22,15 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.BitSet; import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; /** * Class storing the contingency table and related data on two clusterings. @@ -59,14 +58,9 @@ public class ClusterContingencyTable { protected boolean selfPairing = true; /** - * Number of clusters in first + * Number of clusters. */ - protected int size1 = -1; - - /** - * Number of clusters in second - */ - protected int size2 = -1; + protected int size1 = -1, size2 = -1; /** * Contingency matrix @@ -76,12 +70,7 @@ public class ClusterContingencyTable { /** * Noise flags */ - protected BitSet noise1 = null; - - /** - * Noise flags - */ - protected BitSet noise2 = null; + protected long[] noise1 = null, noise2 = null; /** * Pair counting measures @@ -135,46 +124,39 @@ public class ClusterContingencyTable { size1 = cs1.size(); size2 = cs2.size(); contingency = new int[size1 + 2][size2 + 2]; - noise1 = new BitSet(size1); - noise2 = new BitSet(size2); + noise1 = BitsUtil.zero(size1); + noise2 = BitsUtil.zero(size2); // Fill main part of matrix { - { - final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); - for(int i2 = 0; it2.hasNext(); i2++) { - final Cluster<?> c2 = it2.next(); - if(c2.isNoise()) { - noise2.set(i2); - } - contingency[size1 + 1][i2] = c2.size(); - contingency[size1 + 1][size2] += c2.size(); + final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); + for(int i2 = 0; it2.hasNext(); i2++) { + final Cluster<?> c2 = it2.next(); + if(c2.isNoise()) { + BitsUtil.setI(noise2, i2); } + contingency[size1 + 1][i2] = c2.size(); + contingency[size1 + 1][size2] += c2.size(); } - final Iterator<? extends Cluster<?>> it1 = cs1.iterator(); - for(int i1 = 0; it1.hasNext(); i1++) { - final Cluster<?> c1 = it1.next(); - if(c1.isNoise()) { - noise1.set(i1); - } - final DBIDs ids = DBIDUtil.ensureSet(c1.getIDs()); - contingency[i1][size2 + 1] = c1.size(); - contingency[size1][size2 + 1] += c1.size(); - - final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); - for(int i2 = 0; it2.hasNext(); i2++) { - final Cluster<?> c2 = it2.next(); - int count = 0; - for(DBIDIter iter = c2.getIDs().iter(); iter.valid(); iter.advance()) { - if(ids.contains(iter)) { - count++; - } - } - contingency[i1][i2] = count; - contingency[i1][size2] += count; - contingency[size1][i2] += count; - contingency[size1][size2] += count; - } + } + final Iterator<? extends Cluster<?>> it1 = cs1.iterator(); + for(int i1 = 0; it1.hasNext(); i1++) { + final Cluster<?> c1 = it1.next(); + if(c1.isNoise()) { + BitsUtil.setI(noise1, i1); + } + final DBIDs ids = DBIDUtil.ensureSet(c1.getIDs()); + contingency[i1][size2 + 1] = c1.size(); + contingency[size1][size2 + 1] += c1.size(); + + final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); + for(int i2 = 0; it2.hasNext(); i2++) { + final Cluster<?> c2 = it2.next(); + int count = DBIDUtil.intersectionSize(ids, c2.getIDs()); + contingency[i1][i2] = count; + contingency[i1][size2] += count; + contingency[size1][i2] += count; + contingency[size1][size2] += count; } } } @@ -196,9 +178,6 @@ public class ClusterContingencyTable { buf.append('\n'); } } - // if(pairconfuse != null) { - // buf.append(FormatUtil.format(pairconfuse)); - // } return buf.toString(); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java index 6bbda98a..41b69de6 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java index 58f19f65..654cecbe 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java @@ -1,9 +1,10 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -36,7 +37,10 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Sascha Goldhofer */ -@Reference(authors = "Meilă, M.", title = "Comparing clusterings by the variation of information", booktitle = "Learning theory and kernel machines", url = "http://dx.doi.org/10.1007/978-3-540-45167-9_14") +@Reference(authors = "Meilă, M.", // +title = "Comparing clusterings by the variation of information", // +booktitle = "Learning theory and kernel machines", // +url = "http://dx.doi.org/10.1007/978-3-540-45167-9_14") public class Entropy { /** * Entropy in first @@ -105,8 +109,8 @@ public class Entropy { } /** - * Get the entropy of the second clustering using Log_2. (not normalized, 0 - * = equal) + * Get the entropy of the second clustering using Log_2. (not normalized, 0 = + * equal) * * @return Entropy of second clustering */ @@ -144,8 +148,7 @@ public class Entropy { } /** - * Get Powers entropy (normalized, 0 = equal) - * Powers = 1 - NMI_Sum + * Get Powers entropy (normalized, 0 = equal) Powers = 1 - NMI_Sum * * @return Powers */ @@ -168,7 +171,7 @@ public class Entropy { * @return Joint Normalized Mutual information */ public double entropyNMIJoint() { - if (entropyJoint() == 0) { + if(entropyJoint() == 0) { return 0; } return (entropyMutualInformation() / entropyJoint()); @@ -207,7 +210,7 @@ public class Entropy { * @return Sqrt Normalized Mutual information */ public double entropyNMISqrt() { - if (entropyFirst() * entropySecond() <= 0) { + if(entropyFirst() * entropySecond() <= 0) { return entropyMutualInformation(); } return (entropyMutualInformation() / Math.sqrt(entropyFirst() * entropySecond())); @@ -223,20 +226,23 @@ public class Entropy { } /** - * Get the normalized variation of information (normalized, 0 = equal) - * NVI = 1 - NMI_Joint + * Get the normalized variation of information (normalized, 0 = equal) NVI = 1 + * - NMI_Joint * * <p> - * Vinh, N.X. and Epps, J. and Bailey, J.<br /> - * Information theoretic measures for clusterings comparison: is a - * correction for chance necessary?<br /> - * In: Proc. ICML '09 Proceedings of the 26th Annual International - * Conference on Machine Learning + * Nguyen, X. V. and Epps, J. and Bailey, J.<br /> + * Information theoretic measures for clusterings comparison: is a correction + * for chance necessary?<br /> + * In: Proc. ICML '09 Proceedings of the 26th Annual International Conference + * on Machine Learning * </p> * * @return Normalized Variation of information */ - @Reference(authors = "Vinh, N.X. and Epps, J. and Bailey, J.", title = "Information theoretic measures for clusterings comparison: is a correction for chance necessary?", booktitle = "Proc. ICML '09 Proceedings of the 26th Annual International Conference on Machine Learning", url = "http://dx.doi.org/10.1145/1553374.1553511") + @Reference(authors = "Nguyen, X. V. and Epps, J. and Bailey, J.", // + title = "Information theoretic measures for clusterings comparison: is a correction for chance necessary?", // + booktitle = "Proc. ICML '09 Proceedings of the 26th Annual International Conference on Machine Learning", // + url = "http://dx.doi.org/10.1145/1553374.1553511") public double normalizedVariationOfInformation() { return (1.0 - (entropyMutualInformation() / entropyJoint())); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java index 6db25736..2a67dbe2 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,12 +32,12 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.result.BasicResult; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.EvaluationResult; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.ResultUtil; -import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; -import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -156,7 +156,9 @@ public class EvaluateClustering implements Evaluator { ClusterContingencyTable contmat = new ClusterContingencyTable(selfPairing, noiseSpecialHandling); contmat.process(refc, c); - db.getHierarchy().add(c, new ScoreResult(contmat)); + ScoreResult sr = new ScoreResult(contmat); + sr.addHeader(c.getLongName()); + db.getHierarchy().add(c, sr); } } @@ -184,7 +186,7 @@ public class EvaluateClustering implements Evaluator { * * @apiviz.composedOf ClusterContingencyTable */ - public static class ScoreResult extends BasicResult implements TextWriteable { + public static class ScoreResult extends EvaluationResult { /** * Cluster contingency table */ @@ -198,6 +200,43 @@ public class EvaluateClustering implements Evaluator { public ScoreResult(ClusterContingencyTable contmat) { super("Cluster-Evalation", "cluster-evaluation"); this.contmat = contmat; + + PairCounting paircount = contmat.getPaircount(); + MeasurementGroup g = newGroup("Pair counting measures"); + g.addMeasure("Jaccard", paircount.jaccard(), 0, 1, false); + g.addMeasure("F1-Measure", paircount.f1Measure(), 0, 1, false); + g.addMeasure("Precision", paircount.precision(), 0, 1, false); + g.addMeasure("Recall", paircount.recall(), 0, 1, false); + g.addMeasure("Rand", paircount.randIndex(), 0, 1, false); + g.addMeasure("ARI", paircount.adjustedRandIndex(), 0, 1, false); + g.addMeasure("FowlkesMallows", paircount.fowlkesMallows(), 0, 1, false); + + Entropy entropy = contmat.getEntropy(); + g = newGroup("Entropy based measures"); + g.addMeasure("NMI Joint", entropy.entropyNMIJoint(), 0, 1, false); + g.addMeasure("NMI Sqrt", entropy.entropyNMISqrt(), 0, 1, false); + + BCubed bcubed = contmat.getBCubed(); + g = newGroup("BCubed-based measures"); + g.addMeasure("F1-Measure", bcubed.f1Measure(), 0, 1, false); + g.addMeasure("Recall", bcubed.recall(), 0, 1, false); + g.addMeasure("Precision", bcubed.precision(), 0, 1, false); + + SetMatchingPurity setm = contmat.getSetMatching(); + g = newGroup("Set-Matching-based measures"); + g.addMeasure("F1-Measure", setm.f1Measure(), 0, 1, false); + g.addMeasure("Purity", setm.purity(), 0, 1, false); + g.addMeasure("Inverse Purity", setm.inversePurity(), 0, 1, false); + + EditDistance edit = contmat.getEdit(); + g = newGroup("Editing-distance measures"); + g.addMeasure("F1-Measure", edit.f1Measure(), 0, 1, false); + g.addMeasure("Precision", edit.editDistanceFirst(), 0, 1, false); + g.addMeasure("Recall", edit.editDistanceSecond(), 0, 1, false); + + MeanVariance gini = contmat.averageSymmetricGini(); + g = newGroup("Gini measures"); + g.addMeasure("Mean +-" + FormatUtil.NF4.format(gini.getCount() > 1. ? gini.getSampleStddev() : 0.), gini.getMean(), 0, 1, false); } /** @@ -208,47 +247,6 @@ public class EvaluateClustering implements Evaluator { public ClusterContingencyTable getContingencyTable() { return contmat; } - - @Override - public void writeToText(TextWriterStream out, String label) { - out.commentPrint("Pair-F1, "); - out.commentPrint("Pair-Precision, "); - out.commentPrint("Pair-Recall, "); - out.commentPrint("Pair-Rand, "); - out.commentPrint("Pair-AdjustedRand, "); - out.commentPrint("Pair-FowlkesMallows, "); - out.commentPrint("Pair-Jaccard, "); - out.commentPrint("Pair-Mirkin, "); - out.commentPrint("Entropy-VI, "); - out.commentPrint("Entropy-NormalizedVI, "); - out.commentPrint("Entropy-F1, "); - out.commentPrint("Edit-F1, "); - out.commentPrint("SM-InvPurity, "); - out.commentPrint("SM-Purity, "); - out.commentPrint("SM-F1, "); - out.commentPrint("BCubed-Precision, "); - out.commentPrint("BCubed-Recall, "); - out.commentPrint("BCubed-F1"); - out.flush(); - out.inlinePrint(contmat.getPaircount().f1Measure()); - out.inlinePrint(contmat.getPaircount().precision()); - out.inlinePrint(contmat.getPaircount().recall()); - out.inlinePrint(contmat.getPaircount().randIndex()); - out.inlinePrint(contmat.getPaircount().adjustedRandIndex()); - out.inlinePrint(contmat.getPaircount().fowlkesMallows()); - out.inlinePrint(contmat.getPaircount().jaccard()); - out.inlinePrint(contmat.getPaircount().mirkin()); - out.inlinePrint(contmat.getEntropy().variationOfInformation()); - out.inlinePrint(contmat.getEntropy().normalizedVariationOfInformation()); - out.inlinePrint(contmat.getEdit().f1Measure()); - out.inlinePrint(contmat.getSetMatching().inversePurity()); - out.inlinePrint(contmat.getSetMatching().purity()); - out.inlinePrint(contmat.getSetMatching().f1Measure()); - out.inlinePrint(contmat.getBCubed().precision()); - out.inlinePrint(contmat.getBCubed().recall()); - out.inlinePrint(contmat.getBCubed().f1Measure()); - out.flush(); - } } /** diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java index ef7935d8..cbe8e618 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,6 +24,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; */ import de.lmu.ifi.dbs.elki.logging.LoggingUtil; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; /** @@ -54,16 +55,18 @@ public class PairCounting { long inBoth = 0, in1 = 0, in2 = 0, total = 0; // Process first clustering: { - for (int i1 = 0; i1 < table.size1; i1++) { + for(int i1 = 0; i1 < table.size1; i1++) { final int size = table.contingency[i1][table.size2 + 1]; - if (table.breakNoiseClusters && table.noise1.get(i1)) { - if (table.selfPairing) { + if(table.breakNoiseClusters && BitsUtil.get(table.noise1, i1)) { + if(table.selfPairing) { in1 += size; } // else: 0 - } else { - if (table.selfPairing) { + } + else { + if(table.selfPairing) { in1 += size * size; - } else { + } + else { in1 += size * (size - 1); } } @@ -71,33 +74,37 @@ public class PairCounting { } // Process second clustering: { - for (int i2 = 0; i2 < table.size2; i2++) { + for(int i2 = 0; i2 < table.size2; i2++) { final int size = table.contingency[table.size1 + 1][i2]; - if (table.breakNoiseClusters && table.noise2.get(i2)) { - if (table.selfPairing) { + if(table.breakNoiseClusters && BitsUtil.get(table.noise2, i2)) { + if(table.selfPairing) { in2 += size; } // else: 0 - } else { - if (table.selfPairing) { + } + else { + if(table.selfPairing) { in2 += size * size; - } else { + } + else { in2 += size * (size - 1); } } } } // Process combinations - for (int i1 = 0; i1 < table.size1; i1++) { - for (int i2 = 0; i2 < table.size2; i2++) { + for(int i1 = 0; i1 < table.size1; i1++) { + for(int i2 = 0; i2 < table.size2; i2++) { final int size = table.contingency[i1][i2]; - if (table.breakNoiseClusters && (table.noise1.get(i1) || table.noise2.get(i2))) { - if (table.selfPairing) { + if(table.breakNoiseClusters && (BitsUtil.get(table.noise1, i1) || BitsUtil.get(table.noise2, i2))) { + if(table.selfPairing) { inBoth += size; } // else: 0 - } else { - if (table.selfPairing) { + } + else { + if(table.selfPairing) { inBoth += size * size; - } else { + } + else { inBoth += size * (size - 1); } } @@ -105,15 +112,16 @@ public class PairCounting { } // The official sum int tsize = table.contingency[table.size1][table.size2]; - if (table.contingency[table.size1][table.size2 + 1] != tsize || table.contingency[table.size1 + 1][table.size2] != tsize) { + if(table.contingency[table.size1][table.size2 + 1] != tsize || table.contingency[table.size1 + 1][table.size2] != tsize) { LoggingUtil.warning("PairCounting F-Measure is not well defined for overlapping and incomplete clusterings. The number of elements are: " + table.contingency[table.size1][table.size2 + 1] + " != " + table.contingency[table.size1 + 1][table.size2] + " elements."); } - if (tsize < 0 || tsize >= MAX_SIZE) { + if(tsize < 0 || tsize >= MAX_SIZE) { LoggingUtil.warning("Your data set size probably is too big for this implementation, which uses only long precision."); } - if (table.selfPairing) { + if(table.selfPairing) { total = tsize * tsize; - } else { + } + else { total = tsize * (tsize - 1); } long inFirst = in1 - inBoth, inSecond = in2 - inBoth; @@ -172,7 +180,9 @@ public class PairCounting { * @return pair-counting Fowlkes-mallows */ // TODO: implement for non-flat clusterings! - @Reference(authors = "Fowlkes, E.B. and Mallows, C.L.", title = "A method for comparing two hierarchical clusterings", booktitle = "Journal of the American Statistical Association, Vol. 78 Issue 383") + @Reference(authors = "Fowlkes, E.B. and Mallows, C.L.", // + title = "A method for comparing two hierarchical clusterings", // + booktitle = "Journal of the American Statistical Association, Vol. 78 Issue 383") public double fowlkesMallows() { return Math.sqrt(precision() * recall()); } @@ -188,7 +198,10 @@ public class PairCounting { * * @return The Rand index (RI). */ - @Reference(authors = "Rand, W. M.", title = "Objective Criteria for the Evaluation of Clustering Methods", booktitle = "Journal of the American Statistical Association, Vol. 66 Issue 336", url = "http://www.jstor.org/stable/10.2307/2284239") + @Reference(authors = "Rand, W. M.", // + title = "Objective Criteria for the Evaluation of Clustering Methods", // + booktitle = "Journal of the American Statistical Association, Vol. 66 Issue 336", // + url = "http://www.jstor.org/stable/10.2307/2284239") public double randIndex() { final double sum = pairconfuse[0] + pairconfuse[1] + pairconfuse[2] + pairconfuse[3]; return (pairconfuse[0] + pairconfuse[3]) / sum; @@ -203,9 +216,10 @@ public class PairCounting { final double nom = pairconfuse[0] * pairconfuse[3] - pairconfuse[1] * pairconfuse[2]; final long d1 = (pairconfuse[0] + pairconfuse[1]) * (pairconfuse[1] + pairconfuse[3]); final long d2 = (pairconfuse[0] + pairconfuse[2]) * (pairconfuse[2] + pairconfuse[3]); - if (d1 + d2 > 0) { + if(d1 + d2 > 0) { return 2 * nom / (d1 + d2); - } else { + } + else { return 1.; } } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java index 1a003117..d37d4693 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java @@ -1,9 +1,10 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -41,19 +42,29 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * University of Washington, Seattle, Technical Report 418, 2002 * </p> * <p> - * Steinbach, M. and Karypis, G. and Kumar, V. and others<br /> + * Steinbach, M. and Karypis, G. and Kumar, V.<br /> * A comparison of document clustering techniques<br /> * KDD workshop on text mining, 2000 * </p> + * <p> + * E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo <br /> + * A comparison of extrinsic clustering evaluation metrics based on formal + * constraints<br /> + * Inf. Retrieval, vol. 12, no. 5, pp. 461–486, 2009 + * </p> * * @author Sascha Goldhofer */ -@Reference(authors = "Meilă, M", title = "Comparing clusterings", booktitle = "University of Washington, Seattle, Technical Report 418, 2002", url = "http://www.stat.washington.edu/mmp/www.stat.washington.edu/mmp/Papers/compare-colt.pdf") +@Reference(authors = "Meilă, M", // +title = "Comparing clusterings", // +booktitle = "University of Washington, Seattle, Technical Report 418, 2002", // +url = "http://www.stat.washington.edu/mmp/Papers/compare-colt.pdf") public class SetMatchingPurity { /** * Result cache */ - protected double smPurity = -1.0, smInversePurity = -1.0, smFFirst = -1.0, smFSecond = - 1.0; + protected double smPurity = -1.0, smInversePurity = -1.0, smFFirst = -1.0, + smFSecond = -1.0; /** * Constructor. @@ -105,14 +116,17 @@ public class SetMatchingPurity { * * @return purity */ - @Reference(authors = "Zhao, Y. and Karypis, G.", title = "Criterion functions for document clustering: Experiments and analysis", booktitle = "University of Minnesota, Department of Computer Science, Technical Report 01-40, 2001", url = "http://www-users.cs.umn.edu/~karypis/publications/Papers/PDF/vscluster.pdf") + @Reference(authors = "Zhao, Y. and Karypis, G.", // + title = "Criterion functions for document clustering: Experiments and analysis", // + booktitle = "University of Minnesota, Department of Computer Science, Technical Report 01-40, 2001", // + url = "http://www-users.cs.umn.edu/~karypis/publications/Papers/PDF/vscluster.pdf") public double purity() { return smPurity; } /** - * Get the set matchings inverse purity (second:first clustering) - * (normalized, 1 = equal) + * Get the set matchings inverse purity (second:first clustering) (normalized, + * 1 = equal) * * @return Inverse purity */ @@ -124,44 +138,57 @@ public class SetMatchingPurity { * Get the set matching F1-Measure * * <p> - * Steinbach, M. and Karypis, G. and Kumar, V. and others<br /> + * Steinbach, M. and Karypis, G. and Kumar, V.<br /> * A comparison of document clustering techniques<br /> * KDD workshop on text mining, 2000 * </p> * * @return Set Matching F1-Measure */ - @Reference(authors = "Steinbach, M. and Karypis, G. and Kumar, V. and others", title = "A comparison of document clustering techniques", booktitle = "KDD workshop on text mining, 2000", url = "http://www-users.itlabs.umn.edu/~karypis/publications/Papers/PDF/doccluster.pdf") + @Reference(authors = "Steinbach, M. and Karypis, G. and Kumar, V.", // + title = "A comparison of document clustering techniques", // + booktitle = "KDD workshop on text mining, 2000", // + url = "http://www-users.itlabs.umn.edu/~karypis/publications/Papers/PDF/doccluster.pdf") public double f1Measure() { return Util.f1Measure(purity(), inversePurity()); } - + /** * Get the Van Rijsbergen’s F measure (asymmetric) for first clustering * * <p> * E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo <br /> - * A comparison of extrinsic clustering evaluation metrics based on formal constraints<br /> + * A comparison of extrinsic clustering evaluation metrics based on formal + * constraints<br /> * Inf. Retrieval, vol. 12, no. 5, pp. 461–486, 2009 * </p> * * @return Set Matching F-Measure of first clustering */ + @Reference(authors = "E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo", // + title = "A comparison of extrinsic clustering evaluation metrics based on formal constraints", // + booktitle = "Inf. Retrieval, vol. 12, no. 5", // + url = "http://dx.doi.org/10.1007/s10791-009-9106-z") public double fMeasureFirst() { return smFFirst; } - + /** * Get the Van Rijsbergen’s F measure (asymmetric) for second clustering * * <p> * E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo <br /> - * A comparison of extrinsic clustering evaluation metrics based on formal constraints<br /> + * A comparison of extrinsic clustering evaluation metrics based on formal + * constraints<br /> * Inf. Retrieval, vol. 12, no. 5, pp. 461–486, 2009 * </p> * * @return Set Matching F-Measure of second clustering */ + @Reference(authors = "E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo", // + title = "A comparison of extrinsic clustering evaluation metrics based on formal constraints", // + booktitle = "Inf. Retrieval, vol. 12, no. 5", // + url = "http://dx.doi.org/10.1007/s10791-009-9106-z") public double fMeasureSecond() { return smFSecond; } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java new file mode 100644 index 00000000..bc2817a6 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java @@ -0,0 +1,265 @@ +package de.lmu.ifi.dbs.elki.evaluation.clustering.internal; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.EvaluationResult; +import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Compute the silhouette of a data set. + * + * Reference: + * <p> + * P. J. Rousseeuw<br /> + * Silhouettes: A graphical aid to the interpretation and validation of cluster + * analysis<br /> + * In: Journal of Computational and Applied Mathematics Volume 20, November 1987 + * </p> + * + * TODO: keep all silhouette values, and allow visualization! + * + * @author Erich Schubert + * + * @param <O> Object type + */ +@Reference(authors = "P. J. Rousseeuw", // +title = "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis", // +booktitle = "Journal of Computational and Applied Mathematics, Volume 20", // +url = "http://dx.doi.org/10.1016%2F0377-0427%2887%2990125-7") +public class EvaluateSilhouette<O> implements Evaluator { + /** + * Logger for debug output. + */ + private static final Logging LOG = Logging.getLogger(EvaluateSilhouette.class); + + /** + * Keep noise "clusters" merged, instead of breaking them into singletons. + */ + private boolean mergenoise = false; + + /** + * Distance function to use. + */ + private DistanceFunction<? super O> distance; + + /** + * Key for logging statistics. + */ + private String key = EvaluateSilhouette.class.getName(); + + /** + * Constructor. + * + * @param distance Distance function + * @param mergenoise Flag to treat noise as clusters, instead of breaking them + * into singletons. + */ + public EvaluateSilhouette(DistanceFunction<? super O> distance, boolean mergenoise) { + super(); + this.distance = distance; + this.mergenoise = mergenoise; + } + + /** + * Evaluate a single clustering. + * + * @param db Database + * @param rel Data relation + * @param dq Distance query + * @param c Clustering + */ + public void evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) { + List<? extends Cluster<?>> clusters = c.getAllClusters(); + MeanVariance msil = new MeanVariance(); + for(Cluster<?> cluster : clusters) { + if(cluster.size() <= 1 || (!mergenoise && cluster.isNoise())) { + // As suggested in Rousseeuw, we use 0 for singletons. + msil.put(0., cluster.size()); + continue; + } + ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs()); + double[] as = new double[ids.size()]; // temporary storage. + DBIDArrayIter it1 = ids.iter(), it2 = ids.iter(); + for(it1.seek(0); it1.valid(); it1.advance()) { + // a: In-cluster distances + double a = as[it1.getOffset()]; // Already computed distances + for(it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) { + final double dist = dq.distance(it1, it2); + a += dist; + as[it2.getOffset()] += dist; + } + a /= (ids.size() - 1); + // b: other clusters: + double min = Double.POSITIVE_INFINITY; + for(Cluster<?> ocluster : clusters) { + if(ocluster == /* yes, reference identity */cluster) { + continue; + } + if(!mergenoise && ocluster.isNoise()) { + // Treat noise cluster as singletons: + for(DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) { + double dist = dq.distance(it1, it3); + if(dist < min) { + min = dist; + } + } + continue; + } + final DBIDs oids = ocluster.getIDs(); + double b = 0.; + for(DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) { + b += dq.distance(it1, it3); + } + b /= oids.size(); + if(b < min) { + min = b; + } + } + msil.put((min - a) / Math.max(min, a)); + } + } + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(key + ".silhouette.noise", mergenoise ? 1L : 0L)); + LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", msil.getMean())); + LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", msil.getSampleStddev())); + } + + EvaluationResult ev = new EvaluationResult("Internal Clustering Evaluation", "internal evaluation"); + MeasurementGroup g = ev.newGroup("Distance-based Evaluation"); + g.addMeasure("Silhouette coefficient +-" + FormatUtil.NF2.format(msil.getSampleStddev()), msil.getMean(), -1., 1., 0., false); + db.getHierarchy().add(c, ev); + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + List<Clustering<?>> crs = ResultUtil.getClusteringResults(result); + if(crs.size() < 1) { + return; + } + Database db = ResultUtil.findDatabase(baseResult); + Relation<O> rel = db.getRelation(distance.getInputTypeRestriction()); + DistanceQuery<O> dq = db.getDistanceQuery(rel, distance); + for(Clustering<?> c : crs) { + evaluateClustering(db, rel, dq, c); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O> extends AbstractParameterizer { + /** + * Parameter for choosing the distance function. + */ + public static final OptionID DISTANCE_ID = new OptionID("silhouette.distance", "Distance function to use for computing the silhouette."); + + /** + * Parameter to treat noise as a single cluster. + */ + public static final OptionID MERGENOISE_ID = new OptionID("silhouette.noisecluster", "Treat noise as a cluster, not as singletons."); + + /** + * Distance function to use. + */ + private DistanceFunction<? super O> distance; + + /** + * Keep noise "clusters" merged. + */ + private boolean mergenoise = false; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<DistanceFunction<? super O>> distP = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + if(config.grab(distP)) { + distance = distP.instantiateClass(config); + } + + Flag noiseP = new Flag(MERGENOISE_ID); + if(config.grab(noiseP)) { + mergenoise = noiseP.isTrue(); + } + } + + @Override + protected EvaluateSilhouette<O> makeInstance() { + return new EvaluateSilhouette<>(distance, mergenoise); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/package-info.java new file mode 100644 index 00000000..952794c8 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/package-info.java @@ -0,0 +1,26 @@ +/** + * Internal evaluation measures for clusterings. + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.evaluation.clustering.internal;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java index 7dc244fd..666de5df 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java index 76508563..dcd972a7 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java index 76769d78..a9e2fbc9 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java index b7215579..ced8d726 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java index 01d0d09a..01d3bfa0 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |