diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation')
19 files changed, 1577 insertions, 406 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java new file mode 100644 index 00000000..888777a5 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java @@ -0,0 +1,159 @@ +package de.lmu.ifi.dbs.elki.evaluation; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Collection; +import java.util.Iterator; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.evaluation.clustering.EvaluateClustering; +import de.lmu.ifi.dbs.elki.evaluation.histogram.ComputeOutlierHistogram; +import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierPrecisionAtKCurve; +import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierPrecisionRecallCurve; +import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierROCCurve; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.scaling.LinearScaling; + +/** + * Evaluator that tries to auto-run a number of evaluation methods. + * + * @author Erich Schubert + * + * @apiviz.uses OutlierResult + * @apiviz.uses Clustering + * @apiviz.composedOf OutlierROCCurve + * @apiviz.composedOf OutlierPrecisionAtKCurve + * @apiviz.composedOf OutlierPrecisionRecallCurve + * @apiviz.composedOf ComputeOutlierHistogram + * @apiviz.composedOf EvaluateClustering + */ +public class AutomaticEvaluation implements Evaluator { + /** + * Class logger + */ + private static final Logging logger = Logging.getLogger(AutomaticEvaluation.class); + + @Override + public void processNewResult(HierarchicalResult baseResult, Result newResult) { + autoEvaluateClusterings(baseResult, newResult); + autoEvaluateOutliers(baseResult, newResult); + } + + protected void autoEvaluateOutliers(HierarchicalResult baseResult, Result newResult) { + Collection<OutlierResult> outliers = ResultUtil.filterResults(newResult, OutlierResult.class); + if(logger.isDebugging()) { + logger.debug("Number of new outlier results: " + outliers.size()); + } + if(outliers.size() > 0) { + ResultUtil.ensureClusteringResult(ResultUtil.findDatabase(baseResult), baseResult); + Collection<Clustering<?>> clusterings = ResultUtil.filterResults(baseResult, Clustering.class); + if(clusterings.size() == 0) { + logger.warning("Could not find a clustering result, even after running 'ensureClusteringResult'?!?"); + return; + } + Clustering<?> basec = clusterings.iterator().next(); + // Find minority class label + int min = Integer.MAX_VALUE; + int total = 0; + String label = null; + if(basec.getAllClusters().size() > 1) { + for(Cluster<?> c : basec.getAllClusters()) { + final int csize = c.getIDs().size(); + total += csize; + if(csize < min) { + min = csize; + label = c.getName(); + } + } + } + if(label == null) { + logger.warning("Could not evaluate outlier results, as I could not find a minority label."); + return; + } + if(min == 1) { + logger.warning("The minority class label had a single object. Try using 'ClassLabelFilter' to identify the class label column."); + } + if(min > 0.05 * total) { + logger.warning("The minority class I discovered (labeled '" + label + "') has " + (min * 100. / total) + "% of objects. Outlier classes should be more rare!"); + } + logger.verbose("Evaluating using minority class: " + label); + Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$"); + // Compute ROC curve + new OutlierROCCurve(pat).processNewResult(baseResult, newResult); + // Compute Precision at k + new OutlierPrecisionAtKCurve(pat, min * 2).processNewResult(baseResult, newResult); + // Compute ROC curve + new OutlierPrecisionRecallCurve(pat).processNewResult(baseResult, newResult); + // Compute outlier histogram + new ComputeOutlierHistogram(pat, 50, new LinearScaling(), false).processNewResult(baseResult, newResult); + } + } + + protected void autoEvaluateClusterings(HierarchicalResult baseResult, Result newResult) { + Collection<Clustering<?>> clusterings = ResultUtil.filterResults(newResult, Clustering.class); + if(logger.isDebugging()) { + logger.warning("Number of new clustering results: " + clusterings.size()); + } + for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext();) { + Clustering<?> test = c.next(); + if ("allinone-clustering".equals(test.getShortName())) { + c.remove(); + } + else if ("allinnoise-clustering".equals(test.getShortName())) { + c.remove(); + } + else if ("bylabel-clustering".equals(test.getShortName())) { + c.remove(); + } + else if ("bymodel-clustering".equals(test.getShortName())) { + c.remove(); + } + } + if (clusterings.size() > 0) { + new EvaluateClustering(new ByLabelOrAllInOneClustering(), false, true).processNewResult(baseResult, newResult); + } + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected AutomaticEvaluation makeInstance() { + return new AutomaticEvaluation(); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java new file mode 100644 index 00000000..453a337f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java @@ -0,0 +1,56 @@ +package de.lmu.ifi.dbs.elki.evaluation; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * No-operation evaluator, that only serves the purpose of explicitely disabling + * the default value of {@link AutomaticEvaluation}, if you do not want + * evaluation to run. + * + * @author Erich Schubert + */ +public class NoAutomaticEvaluation implements Evaluator { + @Override + public void processNewResult(HierarchicalResult baseResult, Result newResult) { + return; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected NoAutomaticEvaluation makeInstance() { + return new NoAutomaticEvaluation(); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java index d8087cf6..fdfaaaa9 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java @@ -69,7 +69,6 @@ public class BCubed { bCubedRecall += (recall * table.contingency[i1][i2]); } } - bCubedPrecision = bCubedPrecision / table.contingency[table.size1][table.size2]; bCubedRecall = bCubedRecall / table.contingency[table.size1][table.size2]; } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java index ac156a4e..70f6e6ce 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java @@ -28,7 +28,7 @@ import java.util.List; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.math.MeanVariance; @@ -37,6 +37,15 @@ import de.lmu.ifi.dbs.elki.math.MeanVariance; * Class storing the contingency table and related data on two clusterings. * * @author Erich Schubert + * + * @apiviz.landmark + * + * @apiviz.uses Clustering + * @apiviz.composedOf PairCounting + * @apiviz.composedOf Entropy + * @apiviz.composedOf EditDistance + * @apiviz.composedOf BCubed + * @apiviz.composedOf SetMatchingPurity */ public class ClusterContingencyTable { /** @@ -156,8 +165,8 @@ public class ClusterContingencyTable { for(int i2 = 0; it2.hasNext(); i2++) { final Cluster<?> c2 = it2.next(); int count = 0; - for(DBID id : c2.getIDs()) { - if(ids.contains(id)) { + for(DBIDIter iter = c2.getIDs().iter(); iter.valid(); iter.advance()) { + if(ids.contains(iter.getDBID())) { count++; } } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java index ded70327..b174f3ed 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java @@ -168,6 +168,9 @@ public class Entropy { * @return Joint Normalized Mutual information */ public double entropyNMIJoint() { + if (entropyJoint() == 0) { + return 0; + } return (entropyMutualInformation() / entropyJoint()); } @@ -204,6 +207,9 @@ public class Entropy { * @return Sqrt Normalized Mutual information */ public double entropyNMISqrt() { + if (entropyFirst() * entropySecond() <= 0) { + return entropyMutualInformation(); + } return (entropyMutualInformation() / Math.sqrt(entropyFirst() * entropySecond())); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java index c82f5cf5..a57fa188 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java @@ -23,11 +23,11 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.Iterator; +import java.util.Collection; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; @@ -115,9 +115,8 @@ public class EvaluateClustering implements Evaluator { Clustering<?> refc = null; // Try to find an existing reference clustering (globally) if(refc == null) { - Iterator<Clustering<?>> cs = ResultUtil.filteredResults(baseResult, Clustering.class); - while(cs.hasNext()) { - Clustering<?> test = cs.next(); + Collection<Clustering<?>> cs = ResultUtil.filterResults(baseResult, Clustering.class); + for(Clustering<?> test : cs) { if(isReferenceResult(test)) { refc = test; break; @@ -126,9 +125,8 @@ public class EvaluateClustering implements Evaluator { } // Try to find an existing reference clustering (locally) if(refc == null) { - Iterator<Clustering<?>> cs = ResultUtil.filteredResults(result, Clustering.class); - while(cs.hasNext()) { - Clustering<?> test = cs.next(); + Collection<Clustering<?>> cs = ResultUtil.filterResults(result, Clustering.class); + for(Clustering<?> test : cs) { if(isReferenceResult(test)) { refc = test; break; @@ -164,10 +162,16 @@ public class EvaluateClustering implements Evaluator { private boolean isReferenceResult(Clustering<?> t) { // FIXME: don't hard-code strings - if(t.getShortName().startsWith("bylabel-")) { + if("bylabel-clustering".equals(t.getShortName())) { return true; } - if(t.getShortName().startsWith("bymodel-")) { + if("bymodel-clustering".equals(t.getShortName())) { + return true; + } + if("allinone-clustering".equals(t.getShortName())) { + return true; + } + if("allinnoise-clustering".equals(t.getShortName())) { return true; } return false; @@ -264,7 +268,7 @@ public class EvaluateClustering implements Evaluator { @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<ClusteringAlgorithm<?>> referencealgP = new ObjectParameter<ClusteringAlgorithm<?>>(REFERENCE_ID, ClusteringAlgorithm.class, ByLabelClustering.class); + ObjectParameter<ClusteringAlgorithm<?>> referencealgP = new ObjectParameter<ClusteringAlgorithm<?>>(REFERENCE_ID, ClusteringAlgorithm.class, ByLabelOrAllInOneClustering.class); if(config.grab(referencealgP)) { referencealg = referencealgP.instantiateClass(config); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java index 2471b135..123cd4a1 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java @@ -1,4 +1,26 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.util.List; diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java index 8ce8fd4a..00700385 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java @@ -1,4 +1,26 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.util.Arrays; diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java index 97c79dfb..ca352367 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java @@ -1,5 +1,28 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -9,6 +32,7 @@ import java.util.TreeMap; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; @@ -47,6 +71,8 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Sascha Goldhofer * @author Erich Schubert + * + * @apiviz.composedOf Segment */ @Reference(title = "Evaluation of Clusterings – Metrics and Visual Support", authors = "Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", booktitle = "Proc. 28th International Conference on Data Engineering (ICDE) 2012", url = "http://elki.dbs.ifi.lmu.de/wiki/PairSegments") public class Segments extends BasicResult implements Iterable<Segment> { @@ -154,7 +180,8 @@ public class Segments extends BasicResult implements Iterable<Segment> { HashSetModifiableDBIDs ndelta1 = DBIDUtil.newHashSet(first); HashSetModifiableDBIDs ndelta2 = DBIDUtil.newHashSet(); HashSetModifiableDBIDs nsecond = DBIDUtil.newHashSet(second.size()); - for(DBID id : clust.getIDs()) { + for(DBIDIter iter2 = clust.getIDs().iter(); iter2.valid(); iter2.advance()) { + DBID id = iter2.getDBID(); if(ndelta1.remove(id)) { nfirstp.add(id); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java b/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java index 124abf9e..2a2bb069 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.evaluation.histogram;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2012 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.ArrayList;
import java.util.Collection;
@@ -29,7 +30,7 @@ import java.util.regex.Pattern; import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -68,6 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction; * @author Erich Schubert
*
* @apiviz.landmark
+ * @apiviz.uses OutlierResult
* @apiviz.has ScalingFunction
* @apiviz.has HistogramResult oneway - - «create»
*/
@@ -186,13 +188,13 @@ public class ComputeOutlierHistogram implements Evaluator { }
ids.removeDBIDs(outlierIds);
// fill histogram with values of each object
- for(DBID id : ids) {
- double result = or.getScores().get(id);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double result = or.getScores().get(iter);
result = scaling.getScaled(result);
hist.aggregate(result, negative);
}
- for(DBID id : outlierIds) {
- double result = or.getScores().get(id);
+ for(DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
+ double result = or.getScores().get(iter);
result = scaling.getScaled(result);
hist.aggregate(result, positive);
}
@@ -223,11 +225,11 @@ public class ComputeOutlierHistogram implements Evaluator { }
/**
- * Parameterization class. - * - * @author Erich Schubert - * - * @apiviz.exclude + * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
*/
public static class Parameterizer extends AbstractParameterizer {
/**
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java index b92e5bde..49775b1a 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java @@ -28,7 +28,7 @@ import java.util.List; import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -147,13 +147,13 @@ public class JudgeOutlierScores implements Evaluator { double posscore = 0.0;
double negscore = 0.0;
// fill histogram with values of each object
- for(DBID id : ids) {
- double result = or.getScores().get(id);
+ for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double result = or.getScores().get(iter);
result = innerScaling.getScaled(scaling.getScaled(result));
posscore += (1.0 - result);
}
- for(DBID id : outlierIds) {
- double result = or.getScores().get(id);
+ for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
+ double result = or.getScores().get(iter);
result = innerScaling.getScaled(scaling.getScaled(result));
negscore += result;
}
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java new file mode 100644 index 00000000..6d65999e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java @@ -0,0 +1,227 @@ +package de.lmu.ifi.dbs.elki.evaluation.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.OrderingResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Compute a curve containing the precision values for an outlier detection + * method. + * + * @author Erich Schubert + * + * @apiviz.has PrecisionAtKCurve + */ +public class OutlierPrecisionAtKCurve implements Evaluator { + /** + * The logger. + */ + private static final Logging logger = Logging.getLogger(OutlierPrecisionAtKCurve.class); + + /** + * The pattern to identify positive classes. + * + * <p> + * Key: {@code -precision.positive} + * </p> + */ + public static final OptionID POSITIVE_CLASS_NAME_ID = OptionID.getOrCreateOptionID("precision.positive", "Class label for the 'positive' class."); + + /** + * Maximum value for k + * + * <p> + * Key: {@code -precision.k} + * </p> + */ + public static final OptionID MAX_K_ID = OptionID.getOrCreateOptionID("precision.maxk", "Maximum value of 'k' to compute the curve up to."); + + /** + * Stores the "positive" class. + */ + private Pattern positiveClassName; + + /** + * Maximum value for k + */ + int maxk = Integer.MAX_VALUE; + + /** + * Constructor. + * + * @param positiveClassName Pattern to recognize outliers + * @param maxk Maximum value for k + */ + public OutlierPrecisionAtKCurve(Pattern positiveClassName, int maxk) { + super(); + this.positiveClassName = positiveClassName; + this.maxk = maxk; + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + Database db = ResultUtil.findDatabase(baseResult); + // Prepare + SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); + + if(positiveids.size() == 0) { + logger.warning("Computing a ROC curve failed - no objects matched."); + return; + } + + List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); + List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); + // Outlier results are the main use case. + for(OutlierResult o : oresults) { + DBIDs sorted = o.getOrdering().iter(o.getOrdering().getDBIDs()); + db.getHierarchy().add(o, computePrecisionResult(o.getScores().size(), positiveids, sorted)); + // Process them only once. + orderings.remove(o.getOrdering()); + } + + // FIXME: find appropriate place to add the derived result + // otherwise apply an ordering to the database IDs. + for(OrderingResult or : orderings) { + DBIDs sorted = or.iter(or.getDBIDs()); + db.getHierarchy().add(or, computePrecisionResult(or.getDBIDs().size(), positiveids, sorted)); + } + } + + private XYCurve computePrecisionResult(int size, SetDBIDs positiveids, DBIDs order) { + if(order.size() != size) { + throw new IllegalStateException("Iterable result doesn't match database size - incomplete ordering?"); + } + int lastk = Math.min(size, maxk); + XYCurve curve = new PrecisionAtKCurve("k", "Precision", lastk); + + int pos = 0; + DBIDIter i = order.iter(); + for(int k = 1; k <= lastk; k++, i.advance()) { + if(positiveids.contains(i.getDBID())) { + pos++; + } + curve.addAndSimplify(k, pos / (double) k); + } + if(logger.isVerbose()) { + logger.verbose("Precision @ " + lastk + " " + ((pos * 1.0) / lastk)); + } + return curve; + } + + /** + * Precision at K curve. + * + * @author Erich Schubert + */ + public static class PrecisionAtKCurve extends XYCurve { + /** + * Constructor. + * + * @param size Size estimation + */ + public PrecisionAtKCurve(String labelx, String labely, int size) { + super("k", "Precision", size); + } + + @Override + public String getLongName() { + return "Precision @ k Curve"; + } + + @Override + public String getShortName() { + return "precision-at-k"; + } + + @Override + public void writeToText(TextWriterStream out, String label) { + final int last = size() - 1; + out.commentPrintLn("Precision @ " + ((int) getX(last)) + ": " + getY(last)); + out.commentPrintSeparator(); + out.flush(); + out.commentPrint(labelx); + out.commentPrint(" "); + out.commentPrint(labely); + out.flush(); + for(int pos = 0; pos < data.size(); pos+=2) { + out.inlinePrint((int)data.get(pos)); + out.inlinePrint(data.get(pos + 1)); + out.flush(); + } + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + protected Pattern positiveClassName = null; + + protected int maxk = Integer.MAX_VALUE; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); + if(config.grab(positiveClassNameP)) { + positiveClassName = positiveClassNameP.getValue(); + } + IntParameter maxkP = new IntParameter(MAX_K_ID, true); + if(config.grab(maxkP)) { + maxk = maxkP.getValue(); + } + } + + @Override + protected OutlierPrecisionAtKCurve makeInstance() { + return new OutlierPrecisionAtKCurve(positiveClassName, maxk); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java new file mode 100644 index 00000000..c839016c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java @@ -0,0 +1,238 @@ +package de.lmu.ifi.dbs.elki.evaluation.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.OrderingResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Compute a curve containing the precision values for an outlier detection + * method. + * + * @author Erich Schubert + * + * @apiviz.has PRCurve + */ +public class OutlierPrecisionRecallCurve implements Evaluator { + /** + * The logger. + */ + private static final Logging logger = Logging.getLogger(OutlierPrecisionRecallCurve.class); + + /** + * The pattern to identify positive classes. + * + * <p> + * Key: {@code -precision.positive} + * </p> + */ + public static final OptionID POSITIVE_CLASS_NAME_ID = OptionID.getOrCreateOptionID("precision.positive", "Class label for the 'positive' class."); + + /** + * Stores the "positive" class. + */ + private Pattern positiveClassName; + + /** + * Constructor. + * + * @param positiveClassName Pattern to recognize outliers + */ + public OutlierPrecisionRecallCurve(Pattern positiveClassName) { + super(); + this.positiveClassName = positiveClassName; + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + Database db = ResultUtil.findDatabase(baseResult); + // Prepare + SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); + + if(positiveids.size() == 0) { + logger.warning("Computing a ROC curve failed - no objects matched."); + return; + } + + List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); + List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); + // Outlier results are the main use case. + for(OutlierResult o : oresults) { + DBIDs sorted = o.getOrdering().iter(o.getOrdering().getDBIDs()); + XYCurve curve = computePrecisionResult(o.getScores().size(), positiveids, sorted.iter(), o.getScores()); + db.getHierarchy().add(o, curve); + // Process them only once. + orderings.remove(o.getOrdering()); + } + + // FIXME: find appropriate place to add the derived result + // otherwise apply an ordering to the database IDs. + for(OrderingResult or : orderings) { + DBIDs sorted = or.iter(or.getDBIDs()); + XYCurve curve = computePrecisionResult(or.getDBIDs().size(), positiveids, sorted.iter(), null); + db.getHierarchy().add(or, curve); + } + } + + private XYCurve computePrecisionResult(int size, SetDBIDs ids, DBIDIter iter, Relation<Double> scores) { + final int postot = ids.size(); + int poscnt = 0, total = 0; + XYCurve curve = new PRCurve(postot + 2); + + double prevscore = Double.NaN; + for(; iter.valid(); iter.advance()) { + // Previous precision rate - y axis + final double curprec = ((double) poscnt) / total; + // Previous recall rate - x axis + final double curreca = ((double) poscnt) / postot; + + // Analyze next point + DBID cur = iter.getDBID(); + // positive or negative match? + if(ids.contains(cur)) { + poscnt += 1; + } + total += 1; + // First iteration ends here + if(total == 1) { + continue; + } + // defer calculation for ties + if(scores != null) { + double curscore = scores.get(cur); + if(Double.compare(prevscore, curscore) == 0) { + continue; + } + prevscore = curscore; + } + // Add a new point (for the previous entry - because of tie handling!) + curve.addAndSimplify(curreca, curprec); + } + // End curve - always at all positives found. + curve.addAndSimplify(1.0, postot / total); + return curve; + } + + /** + * P/R Curve + * + * @author Erich Schubert + */ + public static class PRCurve extends XYCurve { + /** + * AUC value for PR curve + */ + public static final String PRAUC_LABEL = "PR-AUC"; + + /** + * Area under curve + */ + double auc = Double.NaN; + + /** + * Constructor. + * + * @param size Size estimation + */ + public PRCurve(int size) { + super("Recall", "Precision", size); + } + + @Override + public String getLongName() { + return "Precision-Recall-Curve"; + } + + @Override + public String getShortName() { + return "pr-curve"; + } + + /** + * Get AUC value + * + * @return AUC value + */ + public double getAUC() { + if(Double.isNaN(auc)) { + auc = areaUnderCurve(this); + } + return auc; + } + + @Override + public void writeToText(TextWriterStream out, String label) { + out.commentPrintLn(PRAUC_LABEL + ": " + getAUC()); + out.flush(); + super.writeToText(out, label); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + protected Pattern positiveClassName = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); + if(config.grab(positiveClassNameP)) { + positiveClassName = positiveClassNameP.getValue(); + } + } + + @Override + protected OutlierPrecisionRecallCurve makeInstance() { + return new OutlierPrecisionRecallCurve(positiveClassName); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java new file mode 100644 index 00000000..ec73d67f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java @@ -0,0 +1,241 @@ +package de.lmu.ifi.dbs.elki.evaluation.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.OrderingResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Compute a ROC curve to evaluate a ranking algorithm and compute the + * corresponding ROCAUC value. + * + * The parameter {@code -rocauc.positive} specifies the class label of + * "positive" hits. + * + * The nested algorithm {@code -algorithm} will be run, the result will be + * searched for an iterable or ordering result, which then is compared with the + * clustering obtained via the given class label. + * + * @author Erich Schubert + * + * @apiviz.landmark + * + * @apiviz.uses OutlierResult + * @apiviz.uses ROC + * @apiviz.has ROCResult oneway - - «create» + */ +// TODO: maybe add a way to process clustering results as well? +public class OutlierROCCurve implements Evaluator { + /** + * The label we use for marking ROCAUC values. + */ + public static final String ROCAUC_LABEL = "ROCAUC"; + + /** + * The logger. + */ + private static final Logging logger = Logging.getLogger(OutlierROCCurve.class); + + /** + * The pattern to identify positive classes. + * + * <p> + * Key: {@code -rocauc.positive} + * </p> + */ + public static final OptionID POSITIVE_CLASS_NAME_ID = OptionID.getOrCreateOptionID("rocauc.positive", "Class label for the 'positive' class."); + + /** + * Stores the "positive" class. + */ + private Pattern positiveClassName; + + /** + * Constructor. + * + * @param positive_class_name Positive class name pattern + */ + public OutlierROCCurve(Pattern positive_class_name) { + super(); + this.positiveClassName = positive_class_name; + } + + private ROCResult computeROCResult(int size, SetDBIDs positiveids, DBIDs order) { + if(order.size() != size) { + throw new IllegalStateException("Iterable result doesn't match database size - incomplete ordering?"); + } + XYCurve roccurve = ROC.materializeROC(size, positiveids, new ROC.SimpleAdapter(order.iter())); + double rocauc = XYCurve.areaUnderCurve(roccurve); + if(logger.isVerbose()) { + logger.verbose(ROCAUC_LABEL + ": " + rocauc); + } + + final ROCResult rocresult = new ROCResult(roccurve, rocauc); + + return rocresult; + } + + private ROCResult computeROCResult(int size, SetDBIDs positiveids, OutlierResult or) { + XYCurve roccurve = ROC.materializeROC(size, positiveids, new ROC.OutlierScoreAdapter(or)); + double rocauc = XYCurve.areaUnderCurve(roccurve); + if(logger.isVerbose()) { + logger.verbose(ROCAUC_LABEL + ": " + rocauc); + } + + final ROCResult rocresult = new ROCResult(roccurve, rocauc); + + return rocresult; + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + Database db = ResultUtil.findDatabase(baseResult); + // Prepare + SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); + + if(positiveids.size() == 0) { + logger.warning("Computing a ROC curve failed - no objects matched."); + return; + } + + boolean nonefound = true; + List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); + List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); + // Outlier results are the main use case. + for(OutlierResult o : oresults) { + db.getHierarchy().add(o, computeROCResult(o.getScores().size(), positiveids, o)); + // Process them only once. + orderings.remove(o.getOrdering()); + nonefound = false; + } + + // FIXME: find appropriate place to add the derived result + // otherwise apply an ordering to the database IDs. + for(OrderingResult or : orderings) { + DBIDs sorted = or.iter(or.getDBIDs()); + db.getHierarchy().add(or, computeROCResult(or.getDBIDs().size(), positiveids, sorted)); + nonefound = false; + } + + if(nonefound) { + return; + // logger.warning("No results found to process with ROC curve analyzer. Got "+iterables.size()+" iterables, "+orderings.size()+" orderings."); + } + } + + /** + * Result object for ROC curves. + * + * @author Erich Schubert + */ + public static class ROCResult extends XYCurve { + /** + * AUC value + */ + private double auc; + + /** + * Constructor. + * + * @param col roc curve + * @param rocauc ROC AUC value + */ + public ROCResult(XYCurve col, double rocauc) { + super(col); + this.auc = rocauc; + } + + /** + * @return the area under curve + */ + public double getAUC() { + return auc; + } + + @Override + public String getLongName() { + return "ROC Curve"; + } + + @Override + public String getShortName() { + return "roc-curve"; + } + + @Override + public void writeToText(TextWriterStream out, String label) { + out.commentPrintLn(ROCAUC_LABEL + ": " + auc); + out.flush(); + super.writeToText(out, label); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Pattern for positive class. + */ + protected Pattern positiveClassName = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); + if(config.grab(positiveClassNameP)) { + positiveClassName = positiveClassNameP.getValue(); + } + } + + @Override + protected OutlierROCCurve makeInstance() { + return new OutlierROCCurve(positiveClassName); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java new file mode 100644 index 00000000..928ac842 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java @@ -0,0 +1,269 @@ +package de.lmu.ifi.dbs.elki.evaluation.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.OrderingResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Smooth ROC curves are a variation of classic ROC curves that takes the scores + * into account. + * + * Reference: + * <p> + * W. Klement, P. A. Flach, N. Japkowicz, S. Matwin<br /> + * Smooth Receiver Operating Characteristics (smROC) Curves.<br /> + * In: European Conference on Machine Learning and Principles and Practice of + * Knowledge Discovery in Databases (ECML-PKDD'11) + * </p> + * + * However, this method has some deficiencies when the mean score is not 0.5, as + * discussed in: + * <p> + * E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br> + * On Evaluation of Outlier Rankings and Outlier Scores<br> + * In Proceedings of the 12th SIAM International Conference on Data Mining + * (SDM), Anaheim, CA, 2012. + * </p> + * + * @author Erich Schubert + * + * @apiviz.landmark + * + * @apiviz.uses OutlierResult + * @apiviz.has SmROCResult oneway - - «create» + */ +@Reference(authors = "W. Klement, P. A. Flach, N. Japkowicz, S. Matwin", title = "Smooth Receiver Operating Characteristics (smROC) Curves", booktitle = "In: European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD'11)", url = "http://dx.doi.org/10.1007/978-3-642-23783-6_13") +public class OutlierSmROCCurve implements Evaluator { + /** + * The label we use for marking ROCAUC values. + */ + public static final String SMROCAUC_LABEL = "ROCAUC"; + + /** + * The logger. + */ + private static final Logging logger = Logging.getLogger(OutlierSmROCCurve.class); + + /** + * Stores the "positive" class. + */ + private Pattern positiveClassName; + + /** + * Constructor. + * + * @param positive_class_name Positive class name pattern + */ + public OutlierSmROCCurve(Pattern positive_class_name) { + super(); + this.positiveClassName = positive_class_name; + } + + private SmROCResult computeSmROCResult(SetDBIDs positiveids, OutlierResult or) { + Relation<Double> scores = or.getScores(); + final int size = scores.size(); + + // Compute mean, for inversion + double mean = 0.0; + for(DBIDIter iditer = scores.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); + mean += scores.get(id) / size; + } + + SmROCResult curve = new SmROCResult(positiveids.size() + 2); + + // start in bottom left + curve.add(0.0, 0.0); + + int poscnt = 0, negcnt = 0; + double prevscore = Double.NaN; + double x = 0, y = 0; + for (DBIDIter nei = or.getOrdering().iter(or.getOrdering().getDBIDs()).iter(); nei.valid(); nei.advance()) { + // Analyze next point + final DBID curid = nei.getDBID(); + final double curscore = scores.get(curid); + // defer calculation for ties + if(!Double.isNaN(prevscore) && (Double.compare(prevscore, curscore) == 0)) { + // positive or negative match? + if(positiveids.contains(curid)) { + poscnt += 1; + } + else { + negcnt += 1; + } + continue; + } + else { + // Add point for *previous* result (since we are no longer tied with it) + if(prevscore > mean) { + y += poscnt * prevscore + negcnt * (1.0 - prevscore); + x += poscnt * (1.0 - prevscore) + negcnt * prevscore; + } + else if(prevscore < mean) { + y += poscnt * (1.0 - prevscore) + negcnt * prevscore; + x += poscnt * prevscore + negcnt * (1.0 - prevscore); + } + curve.addAndSimplify(x, y); + // positive or negative match? + if(positiveids.contains(curid)) { + poscnt = 1; + negcnt = 0; + } + else { + poscnt = 0; + negcnt = 1; + } + prevscore = curscore; + } + } + // Last point + { + if(prevscore > mean) { + y += poscnt * prevscore + negcnt * (1.0 - prevscore); + x += poscnt * (1.0 - prevscore) + negcnt * prevscore; + } + else if(prevscore < mean) { + y += poscnt * (1.0 - prevscore) + negcnt * prevscore; + x += poscnt * prevscore + negcnt * (1.0 - prevscore); + } + curve.addAndSimplify(x, y); + } + + double rocauc = XYCurve.areaUnderCurve(curve) / (x * y); + if(logger.isVerbose()) { + logger.verbose(SMROCAUC_LABEL + ": " + rocauc); + } + curve.rocauc = rocauc; + + return curve; + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + Database db = ResultUtil.findDatabase(baseResult); + // Prepare + SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); + + if(positiveids.size() == 0) { + logger.warning("Computing a ROC curve failed - no objects matched."); + return; + } + + List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); + List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); + for(OutlierResult o : oresults) { + db.getHierarchy().add(o, computeSmROCResult(positiveids, o)); + orderings.remove(o.getOrdering()); + } + } + + /** + * Result object for Smooth ROC curves. + * + * @author Erich Schubert + */ + public static class SmROCResult extends XYCurve { + /** + * ROC AUC score + */ + double rocauc = Double.NaN; + + /** + * Constructor. + * + * @param size Size estimate + */ + public SmROCResult(int size) { + super("SmROC Negative", "SmROC Positive", size); + } + + @Override + public String getLongName() { + return "SmROC Curve"; + } + + @Override + public String getShortName() { + return "smroc-curve"; + } + + /** + * SmROC AUC value + * + * @return SmROC auc value + */ + public double getAUC() { + return rocauc; + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Pattern for positive class. + */ + protected Pattern positiveClassName = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter positiveClassNameP = new PatternParameter(OutlierROCCurve.POSITIVE_CLASS_NAME_ID); + if(config.grab(positiveClassNameP)) { + positiveClassName = positiveClassNameP.getValue(); + } + } + + @Override + protected OutlierSmROCCurve makeInstance() { + return new OutlierSmROCCurve(positiveClassName); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java new file mode 100644 index 00000000..e5d35424 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java @@ -0,0 +1,175 @@ +package de.lmu.ifi.dbs.elki.evaluation.outlier; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.scaling.IdentityScaling; +import de.lmu.ifi.dbs.elki.utilities.scaling.ScalingFunction; +import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction; + +/** + * Pseudo clustering algorithm that builds clusters based on their outlier + * score. Useful for transforming a numeric outlier score into a 2-class + * dataset. + * + * @author Erich Schubert + */ +public class OutlierThresholdClustering implements Evaluator { + /** + * Scaling function to use + */ + ScalingFunction scaling = null; + + /** + * Thresholds to use + */ + double[] threshold; + + /** + * Constructor. + * + * @param scaling Scaling function + * @param threshold Threshold + */ + public OutlierThresholdClustering(ScalingFunction scaling, double[] threshold) { + super(); + this.scaling = scaling; + this.threshold = threshold; + Arrays.sort(this.threshold); + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result newResult) { + List<OutlierResult> ors = ResultUtil.filterResults(newResult, OutlierResult.class); + for(OutlierResult or : ors) { + baseResult.getHierarchy().add(or, split(or)); + } + } + + private Clustering<Model> split(OutlierResult or) { + Relation<Double> scores = or.getScores(); + if(scaling instanceof OutlierScalingFunction) { + ((OutlierScalingFunction) scaling).prepare(or); + } + ArrayList<ModifiableDBIDs> idlists = new ArrayList<ModifiableDBIDs>(threshold.length + 1); + for(int i = 0; i <= threshold.length; i++) { + idlists.add(DBIDUtil.newHashSet()); + } + for(DBIDIter iter = scores.getDBIDs().iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); + double score = scores.get(id); + if(scaling != null) { + score = scaling.getScaled(score); + } + int i = 0; + for(; i < threshold.length; i++) { + if(score < threshold[i]) { + break; + } + } + idlists.get(i).add(id); + } + Clustering<Model> c = new Clustering<Model>("Outlier threshold clustering", "threshold-clustering"); + for(int i = 0; i <= threshold.length; i++) { + String name = (i == 0) ? "Inlier" : "Outlier_" + threshold[i - 1]; + c.addCluster(new Cluster<Model>(name, idlists.get(i), (i > 0))); + } + return c; + } + + /** + * Parameterization helper + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter to specify a scaling function to use. + * <p> + * Key: {@code -thresholdclust.scaling} + * </p> + */ + public static final OptionID SCALING_ID = OptionID.getOrCreateOptionID("thresholdclust.scaling", "Class to use as scaling function."); + + /** + * Parameter to specify the threshold + * <p> + * Key: {@code -thresholdclust.threshold} + * </p> + */ + public static final OptionID THRESHOLD_ID = OptionID.getOrCreateOptionID("thresholdclust.threshold", "Threshold(s) to apply."); + + /** + * Scaling function to use + */ + ScalingFunction scaling = null; + + /** + * Threshold to use + */ + double[] threshold; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<ScalingFunction> scalingP = new ObjectParameter<ScalingFunction>(SCALING_ID, ScalingFunction.class, IdentityScaling.class); + if(config.grab(scalingP)) { + scaling = scalingP.instantiateClass(config); + } + + DoubleListParameter thresholdP = new DoubleListParameter(THRESHOLD_ID); + if(config.grab(thresholdP)) { + threshold = ArrayLikeUtil.toPrimitiveDoubleArray(thresholdP.getValue()); + } + } + + @Override + protected OutlierThresholdClustering makeInstance() { + return new OutlierThresholdClustering(scaling, threshold); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/roc/ComputeROCCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/roc/ComputeROCCurve.java index 232089ed..fae03d24 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/roc/ComputeROCCurve.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/roc/ComputeROCCurve.java @@ -23,191 +23,31 @@ package de.lmu.ifi.dbs.elki.evaluation.roc; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; import java.util.regex.Pattern; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; -import de.lmu.ifi.dbs.elki.evaluation.Evaluator; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.result.CollectionResult; -import de.lmu.ifi.dbs.elki.result.HierarchicalResult; -import de.lmu.ifi.dbs.elki.result.OrderingResult; -import de.lmu.ifi.dbs.elki.result.Result; -import de.lmu.ifi.dbs.elki.result.ResultUtil; -import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierROCCurve; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleDoublePair; /** - * Compute a ROC curve to evaluate a ranking algorithm and compute the - * corresponding ROCAUC value. + * Compatibility stub. This class has been renamed. * - * The parameter {@code -rocauc.positive} specifies the class label of - * "positive" hits. - * - * The nested algorithm {@code -algorithm} will be run, the result will be - * searched for an iterable or ordering result, which then is compared with the - * clustering obtained via the given class label. + * TODO: remove at 0.6.0 release. * * @author Erich Schubert * - * @apiviz.landmark - * @apiviz.uses ROC - * @apiviz.has ROCResult oneway - - «create» + * @deprecated Renamed to {@link OutlierROCCurve}. */ -// TODO: maybe add a way to process clustering results as well? -public class ComputeROCCurve implements Evaluator { - /** - * The label we use for marking ROCAUC values. - */ - public static final String ROCAUC_LABEL = "ROCAUC"; - - /** - * The logger. - */ - static final Logging logger = Logging.getLogger(ComputeROCCurve.class); - - /** - * The pattern to identify positive classes. - * - * <p> - * Key: {@code -rocauc.positive} - * </p> - */ - public static final OptionID POSITIVE_CLASS_NAME_ID = OptionID.getOrCreateOptionID("rocauc.positive", "Class label for the 'positive' class."); - - /** - * Stores the "positive" class. - */ - private Pattern positiveClassName; - +@Deprecated +public class ComputeROCCurve extends OutlierROCCurve { /** * Constructor. * - * @param positive_class_name Positive class name pattern + * @param positive_class_name Pattern */ public ComputeROCCurve(Pattern positive_class_name) { - super(); - this.positiveClassName = positive_class_name; - } - - private ROCResult computeROCResult(int size, SetDBIDs positiveids, Iterator<DBID> iter) { - ArrayModifiableDBIDs order = DBIDUtil.newArray(size); - while(iter.hasNext()) { - Object o = iter.next(); - if(!(o instanceof DBID)) { - throw new IllegalStateException("Iterable result contained non-DBID - result didn't satisfy requirements"); - } - else { - order.add((DBID) o); - } - } - if(order.size() != size) { - throw new IllegalStateException("Iterable result doesn't match database size - incomplete ordering?"); - } - List<DoubleDoublePair> roccurve = ROC.materializeROC(size, positiveids, new ROC.SimpleAdapter(order.iterator())); - double rocauc = ROC.computeAUC(roccurve); - if(logger.isVerbose()) { - logger.verbose(ROCAUC_LABEL +": " + rocauc); - } - - List<String> header = new ArrayList<String>(1); - header.add(ROCAUC_LABEL+": " + rocauc); - final ROCResult rocresult = new ROCResult(roccurve, header, rocauc); - - return rocresult; - } - - private ROCResult computeROCResult(int size, SetDBIDs positiveids, OutlierResult or) { - List<DoubleDoublePair> roccurve = ROC.materializeROC(size, positiveids, new ROC.OutlierScoreAdapter(or)); - double rocauc = ROC.computeAUC(roccurve); - if(logger.isVerbose()) { - logger.verbose(ROCAUC_LABEL+": " + rocauc); - } - - List<String> header = new ArrayList<String>(1); - header.add(ROCAUC_LABEL+": " + rocauc); - final ROCResult rocresult = new ROCResult(roccurve, header, rocauc); - - return rocresult; - } - - @Override - public void processNewResult(HierarchicalResult baseResult, Result result) { - Database db = ResultUtil.findDatabase(baseResult); - // Prepare - SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); - - if (positiveids.size() == 0) { - logger.warning("Computing a ROC curve failed - no objects matched."); - return; - } - - boolean nonefound = true; - List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); - List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); - // Outlier results are the main use case. - for(OutlierResult o : oresults) { - db.getHierarchy().add(o, computeROCResult(o.getScores().size(), positiveids, o)); - // Process them only once. - orderings.remove(o.getOrdering()); - nonefound = false; - } - - // FIXME: find appropriate place to add the derived result - // otherwise apply an ordering to the database IDs. - for(OrderingResult or : orderings) { - Iterator<DBID> iter = or.iter(or.getDBIDs()); - db.getHierarchy().add(or, computeROCResult(or.getDBIDs().size(), positiveids, iter)); - nonefound = false; - } - - if(nonefound) { - return; - // logger.warning("No results found to process with ROC curve analyzer. Got "+iterables.size()+" iterables, "+orderings.size()+" orderings."); - } - } - - /** - * Result object for ROC curves. - * - * @author Erich Schubert - */ - public static class ROCResult extends CollectionResult<DoubleDoublePair> { - /** - * AUC value - */ - private double auc; - - /** - * Constructor. - * - * @param col roc curve - * @param header header - * @param rocauc ROC AUC value - */ - public ROCResult(Collection<DoubleDoublePair> col, Collection<String> header, double rocauc) { - super("ROC Curve", "roc", col, header); - this.auc = rocauc; - } - - /** - * @return the area under curve - */ - public double getAUC() { - return auc; - } + super(positive_class_name); } /** @@ -218,12 +58,15 @@ public class ComputeROCCurve implements Evaluator { * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { + /** + * Pattern for positive class. + */ protected Pattern positiveClassName = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); + PatternParameter positiveClassNameP = new PatternParameter(OutlierROCCurve.POSITIVE_CLASS_NAME_ID); if(config.grab(positiveClassNameP)) { positiveClassName = positiveClassNameP.getValue(); } @@ -234,4 +77,4 @@ public class ComputeROCCurve implements Evaluator { return new ComputeROCCurve(positiveClassName); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java b/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java index f7fdd254..d85cb137 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java @@ -23,13 +23,12 @@ package de.lmu.ifi.dbs.elki.evaluation.roc; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.ArrayList; import java.util.Iterator; -import java.util.List; import java.util.Set; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDPair; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; @@ -37,8 +36,8 @@ import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleDoublePair; import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; import de.lmu.ifi.dbs.elki.utilities.pairs.PairInterface; @@ -81,23 +80,16 @@ public class ROC { * 'same positions'. * @return area under curve */ - public static <C extends Comparable<? super C>, T> List<DoubleDoublePair> materializeROC(int size, Set<? super T> ids, Iterator<? extends PairInterface<C, T>> nei) { - int postot = ids.size(); - int negtot = size - postot; - int poscnt = 0; - int negcnt = 0; - ArrayList<DoubleDoublePair> res = new ArrayList<DoubleDoublePair>(postot + 2); + public static <C extends Comparable<? super C>, T> XYCurve materializeROC(int size, Set<? super T> ids, Iterator<? extends PairInterface<C, T>> nei) { + final int postot = ids.size(), negtot = size - postot; + int poscnt = 0, negcnt = 0; + XYCurve curve = new XYCurve("True Negative Rate", "True Positive Rate", postot + 2); // start in bottom left - res.add(new DoubleDoublePair(0.0, 0.0)); + curve.add(0.0, 0.0); - PairInterface<C, T> prev = null; + C prevval = null; while(nei.hasNext()) { - // Previous positive rate - y axis - double curpos = ((double) poscnt) / postot; - // Previous negative rate - x axis - double curneg = ((double) negcnt) / negtot; - // Analyze next point PairInterface<C, T> cur = nei.next(); // positive or negative match? @@ -108,46 +100,17 @@ public class ROC { negcnt += 1; } // defer calculation for ties - if((prev != null) && (prev.getFirst().compareTo(cur.getFirst()) == 0)) { + if((prevval != null) && (prevval.compareTo(cur.getFirst()) == 0)) { continue; } - // simplify curve when possible: - if(res.size() >= 2) { - DoubleDoublePair last1 = res.get(res.size() - 2); - DoubleDoublePair last2 = res.get(res.size() - 1); - final double ldx = last2.first - last1.first; - final double cdx = curneg - last2.first; - final double ldy = last2.second - last1.second; - final double cdy = curpos - last2.second; - // vertical simplification - if((ldx == 0) && (cdx == 0)) { - res.remove(res.size() - 1); - } - // horizontal simplification - else if((ldy == 0) && (cdy == 0)) { - res.remove(res.size() - 1); - } - // diagonal simplification - else if(ldy > 0 && cdy > 0) { - if(Math.abs((ldx / ldy) - (cdx / cdy)) < 1E-10) { - res.remove(res.size() - 1); - } - } - } - // Add a new point (for the previous entry!) - res.add(new DoubleDoublePair(curneg, curpos)); - prev = cur; + // Add a new point. + curve.addAndSimplify(negcnt / (double) negtot, poscnt / (double) postot); + prevval = cur.getFirst(); } - // ensure we end up in the top right corner. - // Since we didn't add a point for the last entry yet, this likely is - // needed. - { - DoubleDoublePair last = res.get(res.size() - 1); - if(last.first < 1.0 || last.second < 1.0) { - res.add(new DoubleDoublePair(1.0, 1.0)); - } - } - return res; + // Ensure we end up in the top right corner. + // Simplification will skip this if we already were. + curve.addAndSimplify(1.0, 1.0); + return curve; } /** @@ -162,23 +125,19 @@ public class ROC { * 'same positions'. * @return area under curve */ - public static <C extends Comparable<? super C>> List<DoubleDoublePair> materializeROC(int size, SetDBIDs ids, Iterator<? extends PairInterface<C, DBID>> nei) { - int postot = ids.size(); - int negtot = size - postot; - int poscnt = 0; - int negcnt = 0; - ArrayList<DoubleDoublePair> res = new ArrayList<DoubleDoublePair>(postot + 2); + public static <C extends Comparable<? super C>> XYCurve materializeROC(int size, SetDBIDs ids, Iterator<? extends PairInterface<C, DBID>> nei) { + final int postot = ids.size(), negtot = size - postot; + int poscnt = 0, negcnt = 0; + XYCurve curve = new XYCurve("True Negative Rate", "True Positive Rate", postot + 2); // start in bottom left - res.add(new DoubleDoublePair(0.0, 0.0)); + curve.add(0.0, 0.0); - PairInterface<C, DBID> prev = null; + C prevval = null; while(nei.hasNext()) { - // Previous positive rate - y axis - double curpos = ((double) poscnt) / postot; - // Previous negative rate - x axis - double curneg = ((double) negcnt) / negtot; - + // Rates at *previous* data point. Because of tie handling strategy! + final double trueneg = negcnt / (double) negtot; + final double truepos = poscnt / (double) postot; // Analyze next point PairInterface<C, DBID> cur = nei.next(); // positive or negative match? @@ -189,46 +148,17 @@ public class ROC { negcnt += 1; } // defer calculation for ties - if((prev != null) && (prev.getFirst().compareTo(cur.getFirst()) == 0)) { + if((prevval != null) && (prevval.compareTo(cur.getFirst()) == 0)) { continue; } - // simplify curve when possible: - if(res.size() >= 2) { - DoubleDoublePair last1 = res.get(res.size() - 2); - DoubleDoublePair last2 = res.get(res.size() - 1); - final double ldx = last2.first - last1.first; - final double cdx = curneg - last2.first; - final double ldy = last2.second - last1.second; - final double cdy = curpos - last2.second; - // vertical simplification - if((ldx == 0) && (cdx == 0)) { - res.remove(res.size() - 1); - } - // horizontal simplification - else if((ldy == 0) && (cdy == 0)) { - res.remove(res.size() - 1); - } - // diagonal simplification - else if(ldy > 0 && cdy > 0) { - if(Math.abs((ldx / ldy) - (cdx / cdy)) < 1E-10) { - res.remove(res.size() - 1); - } - } - } - // Add a new point (for the previous entry!) - res.add(new DoubleDoublePair(curneg, curpos)); - prev = cur; - } - // ensure we end up in the top right corner. - // Since we didn't add a point for the last entry yet, this likely is - // needed. - { - DoubleDoublePair last = res.get(res.size() - 1); - if(last.first < 1.0 || last.second < 1.0) { - res.add(new DoubleDoublePair(1.0, 1.0)); - } + // Add point for *previous* result (since we are no longer tied with it) + curve.addAndSimplify(trueneg, truepos); + prevval = cur.getFirst(); } - return res; + // Ensure we end up in the top right corner. + // Simplification will skip this if we already were. + curve.addAndSimplify(1.0, 1.0); + return curve; } /** @@ -245,27 +175,28 @@ public class ROC { /** * Original Iterator */ - private Iterator<DBID> iter; + private DBIDIter iter; /** * Constructor * * @param iter Iterator for object IDs */ - public SimpleAdapter(Iterator<DBID> iter) { + public SimpleAdapter(DBIDIter iter) { super(); this.iter = iter; } @Override public boolean hasNext() { - return this.iter.hasNext(); + return this.iter.valid(); } @Override public DBIDPair next() { - DBID id = this.iter.next(); - return DBIDUtil.newPair(id, id); + DBIDPair pair = DBIDUtil.newPair(iter, iter); + this.iter.advance(); + return pair; } @Override @@ -332,7 +263,7 @@ public class ROC { /** * Original Iterator */ - private Iterator<DBID> iter; + private DBIDIter iter; /** * Outlier score @@ -346,18 +277,19 @@ public class ROC { */ public OutlierScoreAdapter(OutlierResult o) { super(); - this.iter = o.getOrdering().iter(o.getScores().getDBIDs()); + this.iter = o.getOrdering().iter(o.getScores().getDBIDs()).iter(); this.scores = o.getScores(); } @Override public boolean hasNext() { - return this.iter.hasNext(); + return this.iter.valid(); } @Override public DoubleObjPair<DBID> next() { - DBID id = this.iter.next(); + DBID id = this.iter.getDBID(); + iter.advance(); return new DoubleObjPair<DBID>(scores.get(id), id); } @@ -368,38 +300,6 @@ public class ROC { } /** - * compute the Area Under Curve (difference to y axis) for an arbitrary - * polygon - * - * @param curve Iterable list of points (x,y) - * @return area und curve - */ - public static double computeAUC(Iterable<DoubleDoublePair> curve) { - double result = 0.0; - Iterator<DoubleDoublePair> iter = curve.iterator(); - // it doesn't make sense to speak about the "area under a curve" when there - // is no curve. - if(!iter.hasNext()) { - return Double.NaN; - } - // starting point - DoubleDoublePair prev = iter.next(); - // check there is at least a second point - if(!iter.hasNext()) { - return Double.NaN; - } - while(iter.hasNext()) { - DoubleDoublePair next = iter.next(); - // width * height at half way. - double width = next.first - prev.first; - double meanheight = (next.second + prev.second) / 2; - result += width * meanheight; - prev = next; - } - return result; - } - - /** * Compute a ROC curves Area-under-curve for a QueryResult and a Cluster. * * @param <D> Distance type @@ -424,8 +324,8 @@ public class ROC { */ public static <D extends Distance<D>> double computeROCAUCDistanceResult(int size, DBIDs ids, Iterable<? extends DistanceResultPair<D>> nei) { // TODO: do not materialize the ROC, but introduce an iterator interface - List<DoubleDoublePair> roc = materializeROC(size, DBIDUtil.ensureSet(ids), new DistanceResultAdapter<D>(nei.iterator())); - return computeAUC(roc); + XYCurve roc = materializeROC(size, DBIDUtil.ensureSet(ids), new DistanceResultAdapter<D>(nei.iterator())); + return XYCurve.areaUnderCurve(roc); } /** @@ -438,7 +338,7 @@ public class ROC { */ public static double computeROCAUCSimple(int size, DBIDs ids, DBIDs nei) { // TODO: do not materialize the ROC, but introduce an iterator interface - List<DoubleDoublePair> roc = materializeROC(size, DBIDUtil.ensureSet(ids), new SimpleAdapter(nei.iterator())); - return computeAUC(roc); + XYCurve roc = materializeROC(size, DBIDUtil.ensureSet(ids), new SimpleAdapter(nei.iter())); + return XYCurve.areaUnderCurve(roc); } -} +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java b/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java index 5344da15..0e5c7a02 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java @@ -27,7 +27,6 @@ import java.awt.image.BufferedImage; import java.awt.image.RenderedImage; import java.io.File; import java.io.IOException; -import java.util.Iterator; import java.util.List; import javax.imageio.ImageIO; @@ -37,6 +36,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -49,7 +49,6 @@ import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; -import de.lmu.ifi.dbs.elki.result.IterableResult; import de.lmu.ifi.dbs.elki.result.OrderingResult; import de.lmu.ifi.dbs.elki.result.PixmapResult; import de.lmu.ifi.dbs.elki.result.Result; @@ -124,10 +123,10 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { * @param iter DBID iterator * @return result object */ - private SimilarityMatrix computeSimilarityMatrixImage(Relation<O> relation, Iterator<DBID> iter) { + private SimilarityMatrix computeSimilarityMatrixImage(Relation<O> relation, DBIDIter iter) { ArrayModifiableDBIDs order = DBIDUtil.newArray(relation.size()); - while(iter.hasNext()) { - Object o = iter.next(); + for(; iter.valid(); iter.advance()) { + Object o = iter.getDBID(); if(!(o instanceof DBID)) { throw new IllegalStateException("Iterable result contained non-DBID - result didn't satisfy requirements"); } @@ -199,54 +198,27 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { return new SimilarityMatrix(img, relation, order); } - /** - * Wrap the uncheckable cast with the manual check. - * - * @param ir Interable result - * @return Iterator if Integer iterable, null otherwise. - */ - @SuppressWarnings("unchecked") - private Iterator<DBID> getDBIDIterator(IterableResult<?> ir) { - Iterator<?> testit = ir.iterator(); - if(testit.hasNext() && (testit.next() instanceof DBID)) { - // note: we DO want a fresh iterator here! - return (Iterator<DBID>) ir.iterator(); - } - return null; - } - @Override public void processNewResult(HierarchicalResult baseResult, Result result) { Database db = ResultUtil.findDatabase(baseResult); boolean nonefound = true; List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); - List<IterableResult<?>> iterables = ResultUtil.getIterableResults(result); List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); // Outlier results are the main use case. for(OutlierResult o : oresults) { final OrderingResult or = o.getOrdering(); Relation<O> relation = db.getRelation(distanceFunction.getInputTypeRestriction()); - db.getHierarchy().add(or, computeSimilarityMatrixImage(relation, or.iter(relation.getDBIDs()))); + db.getHierarchy().add(or, computeSimilarityMatrixImage(relation, or.iter(relation.getDBIDs()).iter())); // Process them only once. orderings.remove(or); nonefound = false; } - // try iterable results first - // FIXME: find the appropriate place to call addDerivedResult - for(IterableResult<?> ir : iterables) { - Iterator<DBID> iter = getDBIDIterator(ir); - if(iter != null) { - Relation<O> relation = db.getRelation(distanceFunction.getInputTypeRestriction()); - db.getHierarchy().add(ir, computeSimilarityMatrixImage(relation, iter)); - nonefound = false; - } - } // FIXME: find appropriate place to add the derived result // otherwise apply an ordering to the database IDs. for(OrderingResult or : orderings) { Relation<O> relation = db.getRelation(distanceFunction.getInputTypeRestriction()); - Iterator<DBID> iter = or.iter(relation.getDBIDs()); + DBIDIter iter = or.iter(relation.getDBIDs()).iter(); db.getHierarchy().add(or, computeSimilarityMatrixImage(relation, iter)); nonefound = false; } |