diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java new file mode 100644 index 00000000..c82f5cf5 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java @@ -0,0 +1,288 @@ +package de.lmu.ifi.dbs.elki.evaluation.clustering; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.result.BasicResult; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Evaluate a clustering result by comparing it to an existing cluster label. + * + * @author Erich Schubert + * + * @apiviz.landmark + * @apiviz.uses ClusterContingencyTable + * @apiviz.has EvaluateClustering.ScoreResult oneway - - «create» + */ +public class EvaluateClustering implements Evaluator { + /** + * Logger for debug output. + */ + protected static final Logging logger = Logging.getLogger(EvaluateClustering.class); + + /** + * Parameter to obtain the reference clustering. Defaults to a flat label + * clustering. + */ + public static final OptionID REFERENCE_ID = OptionID.getOrCreateOptionID("paircounting.reference", "Reference clustering to compare with. Defaults to a by-label clustering."); + + /** + * Parameter flag for special noise handling. + */ + public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("paircounting.noisespecial", "Use special handling for noise clusters."); + + /** + * Parameter flag to disable self-pairing + */ + public static final OptionID SELFPAIR_ID = OptionID.getOrCreateOptionID("paircounting.selfpair", "Enable self-pairing for cluster comparison."); + + /** + * Reference algorithm. + */ + private ClusteringAlgorithm<?> referencealg; + + /** + * Apply special handling to noise "clusters". + */ + private boolean noiseSpecialHandling; + + /** + * Use self-pairing in pair-counting measures + */ + private boolean selfPairing; + + /** + * Constructor. + * + * @param referencealg Reference clustering + * @param noiseSpecialHandling Noise handling flag + * @param selfPairing Self-pairing flag + */ + public EvaluateClustering(ClusteringAlgorithm<?> referencealg, boolean noiseSpecialHandling, boolean selfPairing) { + super(); + this.referencealg = referencealg; + this.noiseSpecialHandling = noiseSpecialHandling; + this.selfPairing = selfPairing; + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + Database db = ResultUtil.findDatabase(baseResult); + List<Clustering<?>> crs = ResultUtil.getClusteringResults(result); + if(crs == null || crs.size() < 1) { + return; + } + // Compute the reference clustering + Clustering<?> refc = null; + // Try to find an existing reference clustering (globally) + if(refc == null) { + Iterator<Clustering<?>> cs = ResultUtil.filteredResults(baseResult, Clustering.class); + while(cs.hasNext()) { + Clustering<?> test = cs.next(); + if(isReferenceResult(test)) { + refc = test; + break; + } + } + } + // Try to find an existing reference clustering (locally) + if(refc == null) { + Iterator<Clustering<?>> cs = ResultUtil.filteredResults(result, Clustering.class); + while(cs.hasNext()) { + Clustering<?> test = cs.next(); + if(isReferenceResult(test)) { + refc = test; + break; + } + } + } + if(refc == null) { + logger.debug("Generating a new reference clustering."); + Result refres = referencealg.run(db); + List<Clustering<?>> refcrs = ResultUtil.getClusteringResults(refres); + if(refcrs.size() == 0) { + logger.warning("Reference algorithm did not return a clustering result!"); + return; + } + if(refcrs.size() > 1) { + logger.warning("Reference algorithm returned more than one result!"); + } + refc = refcrs.get(0); + } + else { + logger.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName()); + } + for(Clustering<?> c : crs) { + if(c == refc) { + continue; + } + ClusterContingencyTable contmat = new ClusterContingencyTable(selfPairing, noiseSpecialHandling); + contmat.process(refc, c); + + db.getHierarchy().add(c, new ScoreResult(contmat)); + } + } + + private boolean isReferenceResult(Clustering<?> t) { + // FIXME: don't hard-code strings + if(t.getShortName().startsWith("bylabel-")) { + return true; + } + if(t.getShortName().startsWith("bymodel-")) { + return true; + } + return false; + } + + /** + * Result object for outlier score judgements. + * + * @author Erich Schubert + * + * @apiviz.composedOf ClusterContingencyTable + */ + public static class ScoreResult extends BasicResult implements TextWriteable { + /** + * Cluster contingency table + */ + protected ClusterContingencyTable contmat; + + /** + * Constructor. + * + * @param contmat score result + */ + public ScoreResult(ClusterContingencyTable contmat) { + super("Cluster-Evalation", "cluster-evaluation"); + this.contmat = contmat; + } + + /** + * Get the contingency table + * + * @return the contingency table + */ + public ClusterContingencyTable getContingencyTable() { + return contmat; + } + + @Override + public void writeToText(TextWriterStream out, String label) { + out.commentPrint("Pair-F1, "); + out.commentPrint("Pair-Precision, "); + out.commentPrint("Pair-Recall, "); + out.commentPrint("Pair-Rand, "); + out.commentPrint("Pair-AdjustedRand, "); + out.commentPrint("Pair-FowlkesMallows, "); + out.commentPrint("Pair-Jaccard, "); + out.commentPrint("Pair-Mirkin, "); + out.commentPrint("Entropy-VI, "); + out.commentPrint("Entropy-NormalizedVI, "); + out.commentPrint("Entropy-F1, "); + out.commentPrint("Edit-F1, "); + out.commentPrint("SM-InvPurity, "); + out.commentPrint("SM-Purity, "); + out.commentPrint("SM-F1, "); + out.commentPrint("BCubed-Precision, "); + out.commentPrint("BCubed-Recall, "); + out.commentPrint("BCubed-F1"); + out.flush(); + out.inlinePrint(contmat.getPaircount().f1Measure()); + out.inlinePrint(contmat.getPaircount().precision()); + out.inlinePrint(contmat.getPaircount().recall()); + out.inlinePrint(contmat.getPaircount().randIndex()); + out.inlinePrint(contmat.getPaircount().adjustedRandIndex()); + out.inlinePrint(contmat.getPaircount().fowlkesMallows()); + out.inlinePrint(contmat.getPaircount().jaccard()); + out.inlinePrint(contmat.getPaircount().mirkin()); + out.inlinePrint(contmat.getEntropy().variationOfInformation()); + out.inlinePrint(contmat.getEntropy().normalizedVariationOfInformation()); + out.inlinePrint(contmat.getEdit().f1Measure()); + out.inlinePrint(contmat.getSetMatching().inversePurity()); + out.inlinePrint(contmat.getSetMatching().purity()); + out.inlinePrint(contmat.getSetMatching().f1Measure()); + out.inlinePrint(contmat.getBCubed().precision()); + out.inlinePrint(contmat.getBCubed().recall()); + out.inlinePrint(contmat.getBCubed().f1Measure()); + out.flush(); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + protected ClusteringAlgorithm<?> referencealg = null; + + protected boolean noiseSpecialHandling = false; + + protected boolean selfPairing = false; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<ClusteringAlgorithm<?>> referencealgP = new ObjectParameter<ClusteringAlgorithm<?>>(REFERENCE_ID, ClusteringAlgorithm.class, ByLabelClustering.class); + if(config.grab(referencealgP)) { + referencealg = referencealgP.instantiateClass(config); + } + + Flag noiseSpecialHandlingF = new Flag(NOISE_ID); + if(config.grab(noiseSpecialHandlingF)) { + noiseSpecialHandling = noiseSpecialHandlingF.getValue(); + } + + Flag selfPairingF = new Flag(SELFPAIR_ID); + if(config.grab(selfPairingF)) { + selfPairing = selfPairingF.getValue(); + } + } + + @Override + protected EvaluateClustering makeInstance() { + return new EvaluateClustering(referencealg, noiseSpecialHandling, !selfPairing); + } + } +}
\ No newline at end of file |