summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java288
1 files changed, 288 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java
new file mode 100644
index 00000000..c82f5cf5
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java
@@ -0,0 +1,288 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.BasicResult;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.result.Result;
+import de.lmu.ifi.dbs.elki.result.ResultUtil;
+import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable;
+import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Evaluate a clustering result by comparing it to an existing cluster label.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.landmark
+ * @apiviz.uses ClusterContingencyTable
+ * @apiviz.has EvaluateClustering.ScoreResult oneway - - «create»
+ */
+public class EvaluateClustering implements Evaluator {
+ /**
+ * Logger for debug output.
+ */
+ protected static final Logging logger = Logging.getLogger(EvaluateClustering.class);
+
+ /**
+ * Parameter to obtain the reference clustering. Defaults to a flat label
+ * clustering.
+ */
+ public static final OptionID REFERENCE_ID = OptionID.getOrCreateOptionID("paircounting.reference", "Reference clustering to compare with. Defaults to a by-label clustering.");
+
+ /**
+ * Parameter flag for special noise handling.
+ */
+ public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("paircounting.noisespecial", "Use special handling for noise clusters.");
+
+ /**
+ * Parameter flag to disable self-pairing
+ */
+ public static final OptionID SELFPAIR_ID = OptionID.getOrCreateOptionID("paircounting.selfpair", "Enable self-pairing for cluster comparison.");
+
+ /**
+ * Reference algorithm.
+ */
+ private ClusteringAlgorithm<?> referencealg;
+
+ /**
+ * Apply special handling to noise "clusters".
+ */
+ private boolean noiseSpecialHandling;
+
+ /**
+ * Use self-pairing in pair-counting measures
+ */
+ private boolean selfPairing;
+
+ /**
+ * Constructor.
+ *
+ * @param referencealg Reference clustering
+ * @param noiseSpecialHandling Noise handling flag
+ * @param selfPairing Self-pairing flag
+ */
+ public EvaluateClustering(ClusteringAlgorithm<?> referencealg, boolean noiseSpecialHandling, boolean selfPairing) {
+ super();
+ this.referencealg = referencealg;
+ this.noiseSpecialHandling = noiseSpecialHandling;
+ this.selfPairing = selfPairing;
+ }
+
+ @Override
+ public void processNewResult(HierarchicalResult baseResult, Result result) {
+ Database db = ResultUtil.findDatabase(baseResult);
+ List<Clustering<?>> crs = ResultUtil.getClusteringResults(result);
+ if(crs == null || crs.size() < 1) {
+ return;
+ }
+ // Compute the reference clustering
+ Clustering<?> refc = null;
+ // Try to find an existing reference clustering (globally)
+ if(refc == null) {
+ Iterator<Clustering<?>> cs = ResultUtil.filteredResults(baseResult, Clustering.class);
+ while(cs.hasNext()) {
+ Clustering<?> test = cs.next();
+ if(isReferenceResult(test)) {
+ refc = test;
+ break;
+ }
+ }
+ }
+ // Try to find an existing reference clustering (locally)
+ if(refc == null) {
+ Iterator<Clustering<?>> cs = ResultUtil.filteredResults(result, Clustering.class);
+ while(cs.hasNext()) {
+ Clustering<?> test = cs.next();
+ if(isReferenceResult(test)) {
+ refc = test;
+ break;
+ }
+ }
+ }
+ if(refc == null) {
+ logger.debug("Generating a new reference clustering.");
+ Result refres = referencealg.run(db);
+ List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres);
+ if(refcrs.size() == 0) {
+ logger.warning("Reference algorithm did not return a clustering result!");
+ return;
+ }
+ if(refcrs.size() > 1) {
+ logger.warning("Reference algorithm returned more than one result!");
+ }
+ refc = refcrs.get(0);
+ }
+ else {
+ logger.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
+ }
+ for(Clustering<?> c : crs) {
+ if(c == refc) {
+ continue;
+ }
+ ClusterContingencyTable contmat = new ClusterContingencyTable(selfPairing, noiseSpecialHandling);
+ contmat.process(refc, c);
+
+ db.getHierarchy().add(c, new ScoreResult(contmat));
+ }
+ }
+
+ private boolean isReferenceResult(Clustering<?> t) {
+ // FIXME: don't hard-code strings
+ if(t.getShortName().startsWith("bylabel-")) {
+ return true;
+ }
+ if(t.getShortName().startsWith("bymodel-")) {
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Result object for outlier score judgements.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf ClusterContingencyTable
+ */
+ public static class ScoreResult extends BasicResult implements TextWriteable {
+ /**
+ * Cluster contingency table
+ */
+ protected ClusterContingencyTable contmat;
+
+ /**
+ * Constructor.
+ *
+ * @param contmat score result
+ */
+ public ScoreResult(ClusterContingencyTable contmat) {
+ super("Cluster-Evalation", "cluster-evaluation");
+ this.contmat = contmat;
+ }
+
+ /**
+ * Get the contingency table
+ *
+ * @return the contingency table
+ */
+ public ClusterContingencyTable getContingencyTable() {
+ return contmat;
+ }
+
+ @Override
+ public void writeToText(TextWriterStream out, String label) {
+ out.commentPrint("Pair-F1, ");
+ out.commentPrint("Pair-Precision, ");
+ out.commentPrint("Pair-Recall, ");
+ out.commentPrint("Pair-Rand, ");
+ out.commentPrint("Pair-AdjustedRand, ");
+ out.commentPrint("Pair-FowlkesMallows, ");
+ out.commentPrint("Pair-Jaccard, ");
+ out.commentPrint("Pair-Mirkin, ");
+ out.commentPrint("Entropy-VI, ");
+ out.commentPrint("Entropy-NormalizedVI, ");
+ out.commentPrint("Entropy-F1, ");
+ out.commentPrint("Edit-F1, ");
+ out.commentPrint("SM-InvPurity, ");
+ out.commentPrint("SM-Purity, ");
+ out.commentPrint("SM-F1, ");
+ out.commentPrint("BCubed-Precision, ");
+ out.commentPrint("BCubed-Recall, ");
+ out.commentPrint("BCubed-F1");
+ out.flush();
+ out.inlinePrint(contmat.getPaircount().f1Measure());
+ out.inlinePrint(contmat.getPaircount().precision());
+ out.inlinePrint(contmat.getPaircount().recall());
+ out.inlinePrint(contmat.getPaircount().randIndex());
+ out.inlinePrint(contmat.getPaircount().adjustedRandIndex());
+ out.inlinePrint(contmat.getPaircount().fowlkesMallows());
+ out.inlinePrint(contmat.getPaircount().jaccard());
+ out.inlinePrint(contmat.getPaircount().mirkin());
+ out.inlinePrint(contmat.getEntropy().variationOfInformation());
+ out.inlinePrint(contmat.getEntropy().normalizedVariationOfInformation());
+ out.inlinePrint(contmat.getEdit().f1Measure());
+ out.inlinePrint(contmat.getSetMatching().inversePurity());
+ out.inlinePrint(contmat.getSetMatching().purity());
+ out.inlinePrint(contmat.getSetMatching().f1Measure());
+ out.inlinePrint(contmat.getBCubed().precision());
+ out.inlinePrint(contmat.getBCubed().recall());
+ out.inlinePrint(contmat.getBCubed().f1Measure());
+ out.flush();
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ protected ClusteringAlgorithm<?> referencealg = null;
+
+ protected boolean noiseSpecialHandling = false;
+
+ protected boolean selfPairing = false;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<ClusteringAlgorithm<?>> referencealgP = new ObjectParameter<ClusteringAlgorithm<?>>(REFERENCE_ID, ClusteringAlgorithm.class, ByLabelClustering.class);
+ if(config.grab(referencealgP)) {
+ referencealg = referencealgP.instantiateClass(config);
+ }
+
+ Flag noiseSpecialHandlingF = new Flag(NOISE_ID);
+ if(config.grab(noiseSpecialHandlingF)) {
+ noiseSpecialHandling = noiseSpecialHandlingF.getValue();
+ }
+
+ Flag selfPairingF = new Flag(SELFPAIR_ID);
+ if(config.grab(selfPairingF)) {
+ selfPairing = selfPairingF.getValue();
+ }
+ }
+
+ @Override
+ protected EvaluateClustering makeInstance() {
+ return new EvaluateClustering(referencealg, noiseSpecialHandling, !selfPairing);
+ }
+ }
+} \ No newline at end of file