diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation')
66 files changed, 3699 insertions, 938 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java index fe2e1dc5..91202efe 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/AutomaticEvaluation.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -36,6 +36,7 @@ import de.lmu.ifi.dbs.elki.evaluation.histogram.ComputeOutlierHistogram; import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierPrecisionAtKCurve; import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierPrecisionRecallCurve; import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierROCCurve; +import de.lmu.ifi.dbs.elki.evaluation.outlier.OutlierRankingEvaluation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; import de.lmu.ifi.dbs.elki.result.Result; @@ -73,13 +74,13 @@ public class AutomaticEvaluation implements Evaluator { protected void autoEvaluateOutliers(HierarchicalResult baseResult, Result newResult) { Collection<OutlierResult> outliers = ResultUtil.filterResults(newResult, OutlierResult.class); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.debug("Number of new outlier results: " + outliers.size()); } - if (outliers.size() > 0) { + if(outliers.size() > 0) { ResultUtil.ensureClusteringResult(ResultUtil.findDatabase(baseResult), baseResult); Collection<Clustering<?>> clusterings = ResultUtil.filterResults(baseResult, Clustering.class); - if (clusterings.size() == 0) { + if(clusterings.size() == 0) { LOG.warning("Could not find a clustering result, even after running 'ensureClusteringResult'?!?"); return; } @@ -88,28 +89,30 @@ public class AutomaticEvaluation implements Evaluator { int min = Integer.MAX_VALUE; int total = 0; String label = null; - if (basec.getAllClusters().size() > 1) { - for (Cluster<?> c : basec.getAllClusters()) { + if(basec.getAllClusters().size() > 1) { + for(Cluster<?> c : basec.getAllClusters()) { final int csize = c.getIDs().size(); total += csize; - if (csize < min) { + if(csize < min) { min = csize; label = c.getName(); } } } - if (label == null) { + if(label == null) { LOG.warning("Could not evaluate outlier results, as I could not find a minority label."); return; } - if (min == 1) { + if(min == 1) { LOG.warning("The minority class label had a single object. Try using 'ClassLabelFilter' to identify the class label column."); } - if (min > 0.05 * total) { + if(min > 0.05 * total) { LOG.warning("The minority class I discovered (labeled '" + label + "') has " + (min * 100. / total) + "% of objects. Outlier classes should be more rare!"); } LOG.verbose("Evaluating using minority class: " + label); Pattern pat = Pattern.compile("^" + Pattern.quote(label) + "$"); + // Evaluate rankings. + new OutlierRankingEvaluation(pat).processNewResult(baseResult, newResult); // Compute ROC curve new OutlierROCCurve(pat).processNewResult(baseResult, newResult); // Compute Precision at k @@ -123,25 +126,29 @@ public class AutomaticEvaluation implements Evaluator { protected void autoEvaluateClusterings(HierarchicalResult baseResult, Result newResult) { Collection<Clustering<?>> clusterings = ResultUtil.filterResults(newResult, Clustering.class); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.warning("Number of new clustering results: " + clusterings.size()); } - for (Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext();) { + for(Iterator<Clustering<?>> c = clusterings.iterator(); c.hasNext();) { Clustering<?> test = c.next(); - if ("allinone-clustering".equals(test.getShortName())) { + if("allinone-clustering".equals(test.getShortName())) { c.remove(); - } else if ("allinnoise-clustering".equals(test.getShortName())) { + } + else if("allinnoise-clustering".equals(test.getShortName())) { c.remove(); - } else if ("bylabel-clustering".equals(test.getShortName())) { + } + else if("bylabel-clustering".equals(test.getShortName())) { c.remove(); - } else if ("bymodel-clustering".equals(test.getShortName())) { + } + else if("bymodel-clustering".equals(test.getShortName())) { c.remove(); } } - if (clusterings.size() > 0) { + if(clusterings.size() > 0) { try { new EvaluateClustering(new ByLabelClustering(), false, true).processNewResult(baseResult, newResult); - } catch (NoSupportedDataTypeException e) { + } + catch(NoSupportedDataTypeException e) { // Pass - the data probably did not have labels. } } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/Evaluator.java b/src/de/lmu/ifi/dbs/elki/evaluation/Evaluator.java index 140b29ef..b060af13 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/Evaluator.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/Evaluator.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java index 640a8c9d..7d7b4bed 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/NoAutomaticEvaluation.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/ConfusionMatrix.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/ConfusionMatrix.java new file mode 100644 index 00000000..e7b4cd4f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/ConfusionMatrix.java @@ -0,0 +1,359 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification; +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2014 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +import java.text.NumberFormat; +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; + +/** + * Provides a confusion matrix with some prediction performance measures that + * can be derived from a confusion matrix. + * + * @author Arthur Zimek + */ +public class ConfusionMatrix { + /** + * Holds the confusion matrix. Must be a square matrix. The rows (first index) + * give the values which classes are classified as row, the columns (second + * index) give the values the class col has been classified as. Thus, + * <code>confusion[predicted][real]</code> addresses the number of instances + * of class <i>real</i> that have been assigned to the class <i>predicted</i>. + */ + private int[][] confusion; + + /** + * Holds the class labels. + */ + private ArrayList<ClassLabel> labels; + + /** + * Provides a confusion matrix for the given values. + * + * @param labels the class labels - must conform the confusion matrix in + * length + * @param confusion the confusion matrix. Must be a square matrix. The rows + * (first index) give the values which classes are classified as row, + * the columns (second index) give the values the class col has been + * classified as. Thus, <code>confusion[predicted][real]</code> + * addresses the number of instances of class <i>real</i> that have + * been assigned to the class <i>predicted</i>. + * @throws IllegalArgumentException if the confusion matrix is not square or + * not complete or if the length of class labels does not conform the + * length of the confusion matrix + */ + public ConfusionMatrix(ArrayList<ClassLabel> labels, int[][] confusion) throws IllegalArgumentException { + for(int i = 0; i < confusion.length; i++) { + if(confusion.length != confusion[i].length) { + throw new IllegalArgumentException("Confusion matrix irregular: row-dimension = " + confusion.length + ", col-dimension in col" + i + " = " + confusion[i].length); + } + } + if(confusion.length != labels.size()) { + throw new IllegalArgumentException("Number of class labels does not match row dimension of confusion matrix."); + } + this.confusion = confusion; + this.labels = labels; + } + + /** + * Provides the <i>true positive rate</i>. Aka <i>accuracy</i> or + * <i>sensitivity</i> or <i>recall</i>: <code>TP / (TP+FN)</code>. + * + * + * @return the true positive rate + */ + public double truePositiveRate() { + return ((double) truePositives()) / (double) totalInstances(); + } + + /** + * Provides the <i>false positive rate</i> for the specified class. + * + * + * @param classindex the index of the class to retrieve the false positive + * rate for + * @return the false positive rate for the specified class + */ + public double falsePositiveRate(int classindex) { + int fp = falsePositives(classindex); + int tn = trueNegatives(classindex); + return ((double) fp) / ((double) (fp + tn)); + } + + /** + * Provides the <i>false positive rate</i>. Aka <i>false alarm rate</i>: + * <code>FP / (FP+TN)</code>. + * + * + * @return the false positive rate + */ + public double falsePositiveRate() { + double fpr = 0; + for(int i = 0; i < confusion.length; i++) { + fpr += falsePositiveRate(i) * colSum(i); + } + return fpr / totalInstances(); + } + + /** + * Provides the <i>positive predicted value</i> for the specified class. + * + * + * @param classindex the index of the class to retrieve the positive predicted + * value for + * @return the positive predicted value for the specified class + */ + public double positivePredictedValue(int classindex) { + int tp = truePositives(classindex); + return (double) tp / ((double) (tp + falsePositives(classindex))); + } + + /** + * Provides the <i>positive predicted value</i>. Aka <i>precision</i> or + * <i>specificity</i>: <code>TP / (TP+FP)</code>. + * + * @return the positive predicted value + */ + public double positivePredictedValue() { + double ppv = 0; + for(int i = 0; i < confusion.length; i++) { + ppv += positivePredictedValue(i) * colSum(i); + } + return ppv / totalInstances(); + } + + /** + * The number of correctly classified instances. + * + * + * @return the number of correctly classified instances + */ + public int truePositives() { + int tp = 0; + for(int i = 0; i < confusion.length; i++) { + tp += truePositives(i); + } + return tp; + } + + /** + * The number of correctly classified instances belonging to the specified + * class. + * + * + * @param classindex the index of the class to retrieve the correctly + * classified instances of + * @return the number of correctly classified instances belonging to the + * specified class + */ + public int truePositives(int classindex) { + return confusion[classindex][classindex]; + } + + /** + * Provides the <i>true positive rate</i> for the specified class. + * + * @param classindex the index of the class to retrieve the true positive rate + * for + * @return the true positive rate + */ + public double truePositiveRate(int classindex) { + int tp = truePositives(classindex); + return (double) tp / ((double) (tp + falseNegatives(classindex))); + } + + /** + * The number of true negatives of the specified class. + * + * + * @param classindex the index of the class to retrieve the true negatives for + * @return the number of true negatives of the specified class + */ + public int trueNegatives(int classindex) { + int tn = 0; + for(int i = 0; i < confusion.length; i++) { + for(int j = 0; j < confusion[i].length; j++) { + if(i != classindex && j != classindex) { + tn += confusion[i][j]; + } + } + } + return tn; + } + + /** + * The false positives for the specified class. + * + * + * @param classindex the index of the class to retrieve the false positives + * for + * @return the false positives for the specified class + */ + public int falsePositives(int classindex) { + int fp = 0; + for(int i = 0; i < confusion[classindex].length; i++) { + if(i != classindex) { + fp += confusion[classindex][i]; + } + } + return fp; + } + + /** + * The false negatives for the specified class. + * + * + * @param classindex the index of the class to retrieve the false negatives + * for + * @return the false negatives for the specified class + */ + public int falseNegatives(int classindex) { + int fn = 0; + for(int i = 0; i < confusion.length; i++) { + if(i != classindex) { + fn += confusion[i][classindex]; + } + } + return fn; + } + + /** + * The total number of instances covered by this confusion matrix. + * + * + * @return the total number of instances covered by this confusion matrix + */ + public int totalInstances() { + int total = 0; + for(int i = 0; i < confusion.length; i++) { + for(int j = 0; j < confusion[i].length; j++) { + total += confusion[i][j]; + } + } + return total; + } + + /** + * The number of instances present in the specified row. I.e., classified as + * class <code>classindex</code>. + * + * + * @param classindex the index of the class the resulting number of instances + * has been classified as + * @return the number of instances present in the specified row + */ + public int rowSum(int classindex) { + int s = 0; + for(int i = 0; i < confusion[classindex].length; i++) { + s += confusion[classindex][i]; + } + return s; + } + + /** + * The number of instances present in the specified column. I.e., the + * instances of class <code>classindex</code>. + * + * + * @param classindex the index of the class theresulting number of instances + * belongs to + * @return the number of instances present in the specified column + */ + public int colSum(int classindex) { + int s = 0; + for(int i = 0; i < confusion.length; i++) { + s += confusion[i][classindex]; + } + return s; + } + + /** + * The number of instances belonging to class <code>trueClassindex</code> and + * predicted as <code>predictedClassindex</code>. + * + * + * @param trueClassindex the true class index + * @param predictedClassindex the predicted class index + * @return the number of instances belonging to class + * <code>trueClassindex</code> and predicted as + * <code>predictedClassindex</code> + */ + public int value(int trueClassindex, int predictedClassindex) { + return confusion[predictedClassindex][trueClassindex]; + } + + /** + * Provides a String representation of this confusion matrix. + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + int max = 0; + for(int i = 0; i < confusion.length; i++) { + for(int j = 0; j < confusion[i].length; j++) { + if(confusion[i][j] > max) { + max = confusion[i][j]; + } + } + } + String classPrefix = "C_"; + NumberFormat nf = NumberFormat.getInstance(); + nf.setParseIntegerOnly(true); + int labelLength = Integer.toString(labels.size()).length(); + nf.setMaximumIntegerDigits(labelLength); + nf.setMinimumIntegerDigits(labelLength); + int cell = Math.max(Integer.toString(max).length(), labelLength + classPrefix.length()); + String separator = " "; + StringBuilder representation = new StringBuilder(); + for(int i = 1; i <= labels.size(); i++) { + representation.append(separator); + String label = classPrefix + nf.format(i); + int space = cell - labelLength - classPrefix.length(); + for(int s = 0; s <= space; s++) { + representation.append(' '); + } + representation.append(label); + } + representation.append('\n'); + for(int row = 0; row < confusion.length; row++) { + for(int col = 0; col < confusion[row].length; col++) { + representation.append(separator); + String entry = Integer.toString(confusion[row][col]); + int space = cell - entry.length(); + for(int s = 0; s <= space; s++) { + representation.append(' '); + } + representation.append(entry); + } + representation.append(separator); + representation.append(classPrefix); + representation.append(nf.format(row + 1)); + representation.append(": "); + representation.append(labels.get(row)); + representation.append('\n'); + } + return representation.toString(); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/ConfusionMatrixEvaluationResult.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/ConfusionMatrixEvaluationResult.java new file mode 100644 index 00000000..6bf0f648 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/ConfusionMatrixEvaluationResult.java @@ -0,0 +1,92 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; +import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; + +/** + * Provides the prediction performance measures for a classifier based on the + * confusion matrix. + * + * Note: this API is non-final, and will be refactored soon. + * + * @author Arthur Zimek + */ +public class ConfusionMatrixEvaluationResult implements Result, TextWriteable { + /** + * Holds the confusion matrix. + */ + private ConfusionMatrix confusionmatrix; + + /** + * Holds the used EvaluationProcedure. + */ + private String evaluationName; + + /** + * Provides an evaluation based on the given confusion matrix. + * + * @param confusionmatrix the confusion matrix to provide the prediction + * performance measures for + * @param evaluationName name of the evaluation procedure used + */ + public ConfusionMatrixEvaluationResult(ConfusionMatrix confusionmatrix, String evaluationName) { + super(); + this.confusionmatrix = confusionmatrix; + this.evaluationName = evaluationName; + } + + @Override + public void writeToText(TextWriterStream out, String label) { + out.commentPrintLn("Evaluation:"); + out.commentPrintLn(evaluationName); + // out.println(evaluationProcedure.setting()); + out.commentPrintLn("Accuracy: \n correctly classified instances: "); + out.commentPrintLn(confusionmatrix.truePositives()); + out.commentPrintLn("true positive rate: "); + double tpr = confusionmatrix.truePositiveRate(); + out.commentPrintLn(tpr); + out.commentPrintLn("false positive rate: "); + out.commentPrintLn(confusionmatrix.falsePositiveRate()); + out.commentPrintLn("positive predicted value: "); + double ppv = confusionmatrix.positivePredictedValue(); + out.commentPrintLn(ppv); + out.commentPrintLn("F1-measure: "); + out.commentPrintLn((2 * ppv * tpr) / (ppv + tpr)); + out.commentPrintLn("\nconfusion matrix:\n"); + out.commentPrintLn(confusionmatrix.toString()); + } + + @Override + public String getLongName() { + return "confusionmatrixresult"; + } + + @Override + public String getShortName() { + return "confusionmatrixresult"; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/AbstractHoldout.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/AbstractHoldout.java new file mode 100644 index 00000000..af10eac0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/AbstractHoldout.java @@ -0,0 +1,118 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; + +/** + * Split a data set for holdout evaluation. + * + * @author Erich Schubert + */ +public abstract class AbstractHoldout implements Holdout { + /** + * Labels in the current data set. + */ + protected ArrayList<ClassLabel> labels; + + /** + * Column containing the class labels. + */ + protected int labelcol; + + /** + * Input data bundle. + */ + protected MultipleObjectsBundle bundle; + + @Override + public void initialize(MultipleObjectsBundle bundle) { + this.bundle = bundle; + this.labelcol = findClassLabelColumn(bundle); + this.labels = allClassLabels(bundle); + } + + @Override + public ArrayList<ClassLabel> getLabels() { + return labels; + } + + /** + * Find the class label column in the given data set. + * + * @param bundle Bundle + * @return Class label column + */ + public static int findClassLabelColumn(MultipleObjectsBundle bundle) { + for(int i = 0, l = bundle.metaLength(); i < l; ++i) { + if(TypeUtil.CLASSLABEL.isAssignableFromType(bundle.meta(i))) { + return i; + } + } + return -1; + } + + /** + * Get an array of all class labels in a given data set. + * + * @param bundle Bundle + * @return Class labels. + */ + public static ArrayList<ClassLabel> allClassLabels(MultipleObjectsBundle bundle) { + int col = findClassLabelColumn(bundle); + // TODO: automatically infer class labels? + if(col < 0) { + throw new AbortException("No class label found (try using ClassLabelFilter)."); + } + return allClassLabels(bundle, col); + } + + /** + * Get an array of all class labels in a given data set. + * + * @param bundle Bundle + * @param col Column + * @return Class labels. + */ + public static ArrayList<ClassLabel> allClassLabels(MultipleObjectsBundle bundle, int col) { + HashSet<ClassLabel> labels = new HashSet<ClassLabel>(); + for(int i = 0, l = bundle.dataLength(); i < l; ++i) { + Object o = bundle.data(i, col); + if(o == null || !(o instanceof ClassLabel)) { + continue; + } + labels.add((ClassLabel) o); + } + ArrayList<ClassLabel> ret = new ArrayList<>(labels); + Collections.sort(ret); + return ret; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/DisjointCrossValidation.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/DisjointCrossValidation.java new file mode 100644 index 00000000..75503018 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/DisjointCrossValidation.java @@ -0,0 +1,144 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * DisjointCrossValidationHoldout provides a set of partitions of a database to + * perform cross-validation. The test sets are guaranteed to be disjoint. + * + * @author Arthur Zimek + */ +public class DisjointCrossValidation extends RandomizedHoldout { + /** + * Holds the number of folds, current fold. + */ + protected int nfold, fold; + + /** + * Partition assignment and size. + */ + protected int[] assignment, sizes; + + /** + * Constructor. + * + * @param random Random seeding + * @param nfold Number of folds. + */ + public DisjointCrossValidation(RandomFactory random, int nfold) { + super(random); + this.nfold = nfold; + } + + @Override + public void initialize(MultipleObjectsBundle bundle) { + super.initialize(bundle); + fold = 0; + + Random rnd = random.getSingleThreadedRandom(); + sizes = new int[nfold]; + assignment = new int[bundle.dataLength()]; + for(int i = 0; i < assignment.length; ++i) { + int p = rnd.nextInt(nfold); + assignment[i] = p; + ++sizes[p]; + } + } + + @Override + public int numberOfPartitions() { + return nfold; + } + + @Override + public TrainingAndTestSet nextPartitioning() { + if(fold >= nfold) { + return null; + } + final int tesize = sizes[fold], trsize = bundle.dataLength() - tesize; + MultipleObjectsBundle training = new MultipleObjectsBundle(); + MultipleObjectsBundle test = new MultipleObjectsBundle(); + // Process column-wise. + for(int c = 0, cs = bundle.metaLength(); c < cs; ++c) { + ArrayList<Object> tr = new ArrayList<>(trsize), te = new ArrayList<>(tesize); + for(int i = 0; i < bundle.dataLength(); ++i) { + ((assignment[i] != fold) ? tr : te).add(bundle.data(i, c)); + } + training.appendColumn(bundle.meta(c), tr); + test.appendColumn(bundle.meta(c), te); + } + + ++fold; + return new TrainingAndTestSet(training, test, labels); + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends RandomizedHoldout.Parameterizer { + /** + * Default number of folds. + */ + public static final int N_DEFAULT = 10; + + /** + * Parameter for number of folds. + */ + public static final OptionID NFOLD_ID = new OptionID("nfold", "Number of folds for cross-validation."); + + /** + * Holds the number of folds. + */ + protected int nfold = N_DEFAULT; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter nfoldP = new IntParameter(NFOLD_ID, N_DEFAULT)// + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(nfoldP)) { + nfold = nfoldP.intValue(); + } + } + + @Override + protected DisjointCrossValidation makeInstance() { + return new DisjointCrossValidation(random, nfold); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/Holdout.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/Holdout.java new file mode 100644 index 00000000..17cd56ac --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/Holdout.java @@ -0,0 +1,67 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; + +/** + * A holdout procedure is to provide a range of partitions of a database to + * pairs of training and test data sets. + * + * @author Erich Schubert + */ +public interface Holdout { + /** + * Initialize the holdout procedure for a data set. + * + * @param bundle Data set bundle + */ + void initialize(MultipleObjectsBundle bundle); + + /** + * Get the next partitioning of the given holdout. + * + * @return Next partitioning of the data set + */ + TrainingAndTestSet nextPartitioning(); + + /** + * Get the <i>sorted</i> class labels present in this data set. + * + * For indexing into assignment arrays. + * + * @return Class labels + */ + ArrayList<ClassLabel> getLabels(); + + /** + * How many partitions to test. + * + * @return Number of partitions. + */ + int numberOfPartitions(); +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/LeaveOneOut.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/LeaveOneOut.java new file mode 100644 index 00000000..1059f0a1 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/LeaveOneOut.java @@ -0,0 +1,82 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; + +/** + * A leave-one-out-holdout is to provide a set of partitions of a database where + * each instances once hold out as a test instance while the respectively + * remaining instances are training instances. + * + * @author Arthur Zimek + */ +public class LeaveOneOut extends AbstractHoldout { + /** + * Size of the data set. + */ + private int len, pos; + + /** + * Constructor. + */ + public LeaveOneOut() { + super(); + } + + @Override + public void initialize(MultipleObjectsBundle bundle) { + super.initialize(bundle); + len = bundle.dataLength(); + pos = 0; + } + + @Override + public int numberOfPartitions() { + return len; + } + + @Override + public TrainingAndTestSet nextPartitioning() { + if(pos >= len) { + return null; + } + MultipleObjectsBundle training = new MultipleObjectsBundle(); + MultipleObjectsBundle test = new MultipleObjectsBundle(); + // Process column-wise. + for(int c = 0, cs = bundle.metaLength(); c < cs; ++c) { + ArrayList<Object> tr = new ArrayList<>(len - 1), te = new ArrayList<>(1); + for(int i = 0; i < bundle.dataLength(); ++i) { + ((i != pos) ? tr : te).add(bundle.data(i, c)); + } + training.appendColumn(bundle.meta(c), tr); + test.appendColumn(bundle.meta(c), te); + } + + ++pos; + return new TrainingAndTestSet(training, test, labels); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/RandomizedCrossValidation.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/RandomizedCrossValidation.java new file mode 100644 index 00000000..99607398 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/RandomizedCrossValidation.java @@ -0,0 +1,141 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * RandomizedCrossValidationHoldout provides a set of partitions of a database + * to perform cross-validation. The test sets are not guaranteed to be disjoint. + * + * @author Arthur Zimek + */ +public class RandomizedCrossValidation extends RandomizedHoldout { + /** + * Holds the number of folds, current fold. + */ + protected int nfold, fold; + + /** + * Constructor for n-fold cross-validation. + * + * @param random Random seed + * @param nfold Number of folds + */ + public RandomizedCrossValidation(RandomFactory random, int nfold) { + super(random); + this.nfold = nfold; + } + + @Override + public void initialize(MultipleObjectsBundle bundle) { + super.initialize(bundle); + this.fold = 0; + } + + @Override + public int numberOfPartitions() { + return nfold; + } + + @Override + public TrainingAndTestSet nextPartitioning() { + if(fold >= nfold) { + return null; + } + MultipleObjectsBundle training = new MultipleObjectsBundle(); + MultipleObjectsBundle test = new MultipleObjectsBundle(); + Random rnd = random.getRandom(); + int datalen = bundle.dataLength(); + + boolean[] assignment = new boolean[datalen]; + int trsize = 0, tesize = 0; + for(int i = 0; i < assignment.length; ++i) { + boolean p = rnd.nextInt(nfold) < nfold - 1; + assignment[i] = p; + @SuppressWarnings("unused") + int discard = p ? ++trsize : ++tesize; + } + // Process column-wise. + for(int c = 0, cs = bundle.metaLength(); c < cs; ++c) { + ArrayList<Object> tr = new ArrayList<>(trsize), te = new ArrayList<>(tesize); + for(int i = 0; i < datalen; ++i) { + (assignment[i] ? tr : te).add(bundle.data(i, c)); + } + training.appendColumn(bundle.meta(c), tr); + test.appendColumn(bundle.meta(c), te); + } + + ++fold; + return new TrainingAndTestSet(training, test, labels); + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends RandomizedHoldout.Parameterizer { + /** + * Parameter for number of folds. + */ + public static final OptionID NFOLD_ID = new OptionID("nfold", "positive number of folds for cross-validation"); + + /** + * Default number of folds. + */ + public static final int N_DEFAULT = 10; + + /** + * Holds the number of folds. + */ + protected int nfold; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter nfoldP = new IntParameter(NFOLD_ID)// + .setDefaultValue(N_DEFAULT) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(nfoldP)) { + nfold = nfoldP.intValue(); + } + } + + @Override + protected RandomizedCrossValidation makeInstance() { + return new RandomizedCrossValidation(random, nfold); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/RandomizedHoldout.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/RandomizedHoldout.java new file mode 100644 index 00000000..6202af48 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/RandomizedHoldout.java @@ -0,0 +1,78 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * A holdout providing a seed for randomized operations. + * + * @author Arthur Zimek + */ +public abstract class RandomizedHoldout extends AbstractHoldout { + /** + * The random generator. + */ + protected RandomFactory random; + + /** + * Sets the parameter seed to the parameterToDescription map. + */ + public RandomizedHoldout(RandomFactory random) { + super(); + this.random = random; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static abstract class Parameterizer extends AbstractParameterizer { + /** + * Random seeding for holdout evaluation. + */ + public static final OptionID SEED_ID = new OptionID("holdout.seed", "Random generator seed for holdout evaluation."); + + /** + * The random generator. + */ + protected RandomFactory random; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + RandomParameter seedP = new RandomParameter(SEED_ID); + if(config.grab(seedP)) { + random = seedP.getValue(); + } + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/StratifiedCrossValidation.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/StratifiedCrossValidation.java new file mode 100644 index 00000000..40de7e88 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/StratifiedCrossValidation.java @@ -0,0 +1,162 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import gnu.trove.list.array.TIntArrayList; + +import java.util.ArrayList; +import java.util.Collections; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * A stratified n-fold crossvalidation to distribute the data to n buckets where + * each bucket exhibits approximately the same distribution of classes as does + * the complete data set. The buckets are disjoint. The distribution is + * deterministic. + * + * @author Arthur Zimek + */ +public class StratifiedCrossValidation extends AbstractHoldout { + /** + * Holds the number of folds, current fold. + */ + protected int nfold, fold; + + /** + * Partition assignment, sizes + */ + protected int[] assignment, sizes; + + /** + * Provides a stratified crossvalidation. Setting parameter N_P to the + * OptionHandler. + */ + public StratifiedCrossValidation(int nfold) { + super(); + this.nfold = nfold; + } + + @Override + public int numberOfPartitions() { + return nfold; + } + + @Override + public void initialize(MultipleObjectsBundle bundle) { + super.initialize(bundle); + fold = 0; + TIntArrayList[] classBuckets = new TIntArrayList[this.labels.size()]; + for(int i = 0; i < this.labels.size(); i++) { + classBuckets[i] = new TIntArrayList(); + } + for(int i = 0, l = bundle.dataLength(); i < l; ++i) { + ClassLabel label = (ClassLabel) bundle.data(i, labelcol); + if(label == null) { + throw new AbortException("Unlabeled instances currently not supported."); + } + int classIndex = Collections.binarySearch(labels, label); + if(classIndex < 0) { + throw new AbortException("Label not in label list: " + label); + } + classBuckets[classIndex].add(i); + } + // TODO: shuffle the class buckets? + sizes = new int[nfold]; + assignment = new int[bundle.dataLength()]; + for(TIntArrayList bucket : classBuckets) { + for(int i = 0; i < bucket.size(); i++) { + assignment[bucket.get(i)] = i % nfold; + } + } + } + + @Override + public TrainingAndTestSet nextPartitioning() { + if(fold >= nfold) { + return null; + } + final int tesize = sizes[fold], trsize = bundle.dataLength() - tesize; + MultipleObjectsBundle training = new MultipleObjectsBundle(); + MultipleObjectsBundle test = new MultipleObjectsBundle(); + // Process column-wise. + for(int c = 0, cs = bundle.metaLength(); c < cs; ++c) { + ArrayList<Object> tr = new ArrayList<>(trsize), te = new ArrayList<>(tesize); + for(int i = 0; i < bundle.dataLength(); ++i) { + ((assignment[i] != fold) ? tr : te).add(bundle.data(i, c)); + } + training.appendColumn(bundle.meta(c), tr); + test.appendColumn(bundle.meta(c), te); + } + + ++fold; + return new TrainingAndTestSet(training, test, labels); + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Default number of folds. + */ + public static final int N_DEFAULT = 10; + + /** + * Parameter for number of folds. + */ + public static final OptionID NFOLD_ID = new OptionID("nfold", "Number of folds for cross-validation"); + + /** + * Holds the number of folds. + */ + protected int nfold; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter nfoldP = new IntParameter(NFOLD_ID, N_DEFAULT)// + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(nfoldP)) { + nfold = nfoldP.intValue(); + } + } + + @Override + protected StratifiedCrossValidation makeInstance() { + return new StratifiedCrossValidation(nfold); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/TrainingAndTestSet.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/TrainingAndTestSet.java new file mode 100644 index 00000000..a8725e1c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/TrainingAndTestSet.java @@ -0,0 +1,89 @@ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; + +/** + * Wrapper to hold a pair of training and test data sets. The labels of both + * training and test set are provided in labels. + * + * @author Arthur Zimek + */ +public class TrainingAndTestSet { + /** + * The overall labels. + */ + private ArrayList<ClassLabel> labels; + + /** + * The training data. + */ + private MultipleObjectsBundle training; + + /** + * The test data. + */ + private MultipleObjectsBundle test; + + /** + * Provides a pair of training and test data sets out of the given two + * databases. + */ + public TrainingAndTestSet(MultipleObjectsBundle training, MultipleObjectsBundle test, ArrayList<ClassLabel> labels) { + this.training = training; + this.test = test; + this.labels = labels; + } + + /** + * Returns the test data set. + * + * @return the test data set + */ + public MultipleObjectsBundle getTest() { + return test; + } + + /** + * Returns the training data set. + * + * @return the training data set + */ + public MultipleObjectsBundle getTraining() { + return training; + } + + /** + * Returns all labels present in the data set. + * + * @return all labels + */ + public ArrayList<ClassLabel> getLabels() { + return labels; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/package-info.java new file mode 100644 index 00000000..a469dfc3 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/holdout/package-info.java @@ -0,0 +1,27 @@ +/** + * Holdout and cross-validation strategies for evaluating classifiers. + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.evaluation.classification.holdout; + diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/classification/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/classification/package-info.java new file mode 100644 index 00000000..a6e3f484 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/classification/package-info.java @@ -0,0 +1,27 @@ +/** + * Evaluation of classification algorithms. + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.evaluation.classification; + diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java index 3353b593..1c84f0e1 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/BCubed.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java index c3f8129f..d5dec735 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/ClusterContingencyTable.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -22,16 +22,15 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.BitSet; import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; /** * Class storing the contingency table and related data on two clusterings. @@ -59,14 +58,9 @@ public class ClusterContingencyTable { protected boolean selfPairing = true; /** - * Number of clusters in first + * Number of clusters. */ - protected int size1 = -1; - - /** - * Number of clusters in second - */ - protected int size2 = -1; + protected int size1 = -1, size2 = -1; /** * Contingency matrix @@ -76,12 +70,7 @@ public class ClusterContingencyTable { /** * Noise flags */ - protected BitSet noise1 = null; - - /** - * Noise flags - */ - protected BitSet noise2 = null; + protected long[] noise1 = null, noise2 = null; /** * Pair counting measures @@ -135,46 +124,39 @@ public class ClusterContingencyTable { size1 = cs1.size(); size2 = cs2.size(); contingency = new int[size1 + 2][size2 + 2]; - noise1 = new BitSet(size1); - noise2 = new BitSet(size2); + noise1 = BitsUtil.zero(size1); + noise2 = BitsUtil.zero(size2); // Fill main part of matrix { - { - final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); - for(int i2 = 0; it2.hasNext(); i2++) { - final Cluster<?> c2 = it2.next(); - if(c2.isNoise()) { - noise2.set(i2); - } - contingency[size1 + 1][i2] = c2.size(); - contingency[size1 + 1][size2] += c2.size(); + final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); + for(int i2 = 0; it2.hasNext(); i2++) { + final Cluster<?> c2 = it2.next(); + if(c2.isNoise()) { + BitsUtil.setI(noise2, i2); } + contingency[size1 + 1][i2] = c2.size(); + contingency[size1 + 1][size2] += c2.size(); } - final Iterator<? extends Cluster<?>> it1 = cs1.iterator(); - for(int i1 = 0; it1.hasNext(); i1++) { - final Cluster<?> c1 = it1.next(); - if(c1.isNoise()) { - noise1.set(i1); - } - final DBIDs ids = DBIDUtil.ensureSet(c1.getIDs()); - contingency[i1][size2 + 1] = c1.size(); - contingency[size1][size2 + 1] += c1.size(); - - final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); - for(int i2 = 0; it2.hasNext(); i2++) { - final Cluster<?> c2 = it2.next(); - int count = 0; - for(DBIDIter iter = c2.getIDs().iter(); iter.valid(); iter.advance()) { - if(ids.contains(iter)) { - count++; - } - } - contingency[i1][i2] = count; - contingency[i1][size2] += count; - contingency[size1][i2] += count; - contingency[size1][size2] += count; - } + } + final Iterator<? extends Cluster<?>> it1 = cs1.iterator(); + for(int i1 = 0; it1.hasNext(); i1++) { + final Cluster<?> c1 = it1.next(); + if(c1.isNoise()) { + BitsUtil.setI(noise1, i1); + } + final DBIDs ids = DBIDUtil.ensureSet(c1.getIDs()); + contingency[i1][size2 + 1] = c1.size(); + contingency[size1][size2 + 1] += c1.size(); + + final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); + for(int i2 = 0; it2.hasNext(); i2++) { + final Cluster<?> c2 = it2.next(); + int count = DBIDUtil.intersectionSize(ids, c2.getIDs()); + contingency[i1][i2] = count; + contingency[i1][size2] += count; + contingency[size1][i2] += count; + contingency[size1][size2] += count; } } } @@ -196,9 +178,6 @@ public class ClusterContingencyTable { buf.append('\n'); } } - // if(pairconfuse != null) { - // buf.append(FormatUtil.format(pairconfuse)); - // } return buf.toString(); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java index 6bbda98a..41b69de6 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EditDistance.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java index 58f19f65..654cecbe 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/Entropy.java @@ -1,9 +1,10 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -36,7 +37,10 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * * @author Sascha Goldhofer */ -@Reference(authors = "Meilă, M.", title = "Comparing clusterings by the variation of information", booktitle = "Learning theory and kernel machines", url = "http://dx.doi.org/10.1007/978-3-540-45167-9_14") +@Reference(authors = "Meilă, M.", // +title = "Comparing clusterings by the variation of information", // +booktitle = "Learning theory and kernel machines", // +url = "http://dx.doi.org/10.1007/978-3-540-45167-9_14") public class Entropy { /** * Entropy in first @@ -105,8 +109,8 @@ public class Entropy { } /** - * Get the entropy of the second clustering using Log_2. (not normalized, 0 - * = equal) + * Get the entropy of the second clustering using Log_2. (not normalized, 0 = + * equal) * * @return Entropy of second clustering */ @@ -144,8 +148,7 @@ public class Entropy { } /** - * Get Powers entropy (normalized, 0 = equal) - * Powers = 1 - NMI_Sum + * Get Powers entropy (normalized, 0 = equal) Powers = 1 - NMI_Sum * * @return Powers */ @@ -168,7 +171,7 @@ public class Entropy { * @return Joint Normalized Mutual information */ public double entropyNMIJoint() { - if (entropyJoint() == 0) { + if(entropyJoint() == 0) { return 0; } return (entropyMutualInformation() / entropyJoint()); @@ -207,7 +210,7 @@ public class Entropy { * @return Sqrt Normalized Mutual information */ public double entropyNMISqrt() { - if (entropyFirst() * entropySecond() <= 0) { + if(entropyFirst() * entropySecond() <= 0) { return entropyMutualInformation(); } return (entropyMutualInformation() / Math.sqrt(entropyFirst() * entropySecond())); @@ -223,20 +226,23 @@ public class Entropy { } /** - * Get the normalized variation of information (normalized, 0 = equal) - * NVI = 1 - NMI_Joint + * Get the normalized variation of information (normalized, 0 = equal) NVI = 1 + * - NMI_Joint * * <p> - * Vinh, N.X. and Epps, J. and Bailey, J.<br /> - * Information theoretic measures for clusterings comparison: is a - * correction for chance necessary?<br /> - * In: Proc. ICML '09 Proceedings of the 26th Annual International - * Conference on Machine Learning + * Nguyen, X. V. and Epps, J. and Bailey, J.<br /> + * Information theoretic measures for clusterings comparison: is a correction + * for chance necessary?<br /> + * In: Proc. ICML '09 Proceedings of the 26th Annual International Conference + * on Machine Learning * </p> * * @return Normalized Variation of information */ - @Reference(authors = "Vinh, N.X. and Epps, J. and Bailey, J.", title = "Information theoretic measures for clusterings comparison: is a correction for chance necessary?", booktitle = "Proc. ICML '09 Proceedings of the 26th Annual International Conference on Machine Learning", url = "http://dx.doi.org/10.1145/1553374.1553511") + @Reference(authors = "Nguyen, X. V. and Epps, J. and Bailey, J.", // + title = "Information theoretic measures for clusterings comparison: is a correction for chance necessary?", // + booktitle = "Proc. ICML '09 Proceedings of the 26th Annual International Conference on Machine Learning", // + url = "http://dx.doi.org/10.1145/1553374.1553511") public double normalizedVariationOfInformation() { return (1.0 - (entropyMutualInformation() / entropyJoint())); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java index 6db25736..2a67dbe2 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/EvaluateClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,12 +32,12 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.result.BasicResult; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.EvaluationResult; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.ResultUtil; -import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; -import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -156,7 +156,9 @@ public class EvaluateClustering implements Evaluator { ClusterContingencyTable contmat = new ClusterContingencyTable(selfPairing, noiseSpecialHandling); contmat.process(refc, c); - db.getHierarchy().add(c, new ScoreResult(contmat)); + ScoreResult sr = new ScoreResult(contmat); + sr.addHeader(c.getLongName()); + db.getHierarchy().add(c, sr); } } @@ -184,7 +186,7 @@ public class EvaluateClustering implements Evaluator { * * @apiviz.composedOf ClusterContingencyTable */ - public static class ScoreResult extends BasicResult implements TextWriteable { + public static class ScoreResult extends EvaluationResult { /** * Cluster contingency table */ @@ -198,6 +200,43 @@ public class EvaluateClustering implements Evaluator { public ScoreResult(ClusterContingencyTable contmat) { super("Cluster-Evalation", "cluster-evaluation"); this.contmat = contmat; + + PairCounting paircount = contmat.getPaircount(); + MeasurementGroup g = newGroup("Pair counting measures"); + g.addMeasure("Jaccard", paircount.jaccard(), 0, 1, false); + g.addMeasure("F1-Measure", paircount.f1Measure(), 0, 1, false); + g.addMeasure("Precision", paircount.precision(), 0, 1, false); + g.addMeasure("Recall", paircount.recall(), 0, 1, false); + g.addMeasure("Rand", paircount.randIndex(), 0, 1, false); + g.addMeasure("ARI", paircount.adjustedRandIndex(), 0, 1, false); + g.addMeasure("FowlkesMallows", paircount.fowlkesMallows(), 0, 1, false); + + Entropy entropy = contmat.getEntropy(); + g = newGroup("Entropy based measures"); + g.addMeasure("NMI Joint", entropy.entropyNMIJoint(), 0, 1, false); + g.addMeasure("NMI Sqrt", entropy.entropyNMISqrt(), 0, 1, false); + + BCubed bcubed = contmat.getBCubed(); + g = newGroup("BCubed-based measures"); + g.addMeasure("F1-Measure", bcubed.f1Measure(), 0, 1, false); + g.addMeasure("Recall", bcubed.recall(), 0, 1, false); + g.addMeasure("Precision", bcubed.precision(), 0, 1, false); + + SetMatchingPurity setm = contmat.getSetMatching(); + g = newGroup("Set-Matching-based measures"); + g.addMeasure("F1-Measure", setm.f1Measure(), 0, 1, false); + g.addMeasure("Purity", setm.purity(), 0, 1, false); + g.addMeasure("Inverse Purity", setm.inversePurity(), 0, 1, false); + + EditDistance edit = contmat.getEdit(); + g = newGroup("Editing-distance measures"); + g.addMeasure("F1-Measure", edit.f1Measure(), 0, 1, false); + g.addMeasure("Precision", edit.editDistanceFirst(), 0, 1, false); + g.addMeasure("Recall", edit.editDistanceSecond(), 0, 1, false); + + MeanVariance gini = contmat.averageSymmetricGini(); + g = newGroup("Gini measures"); + g.addMeasure("Mean +-" + FormatUtil.NF4.format(gini.getCount() > 1. ? gini.getSampleStddev() : 0.), gini.getMean(), 0, 1, false); } /** @@ -208,47 +247,6 @@ public class EvaluateClustering implements Evaluator { public ClusterContingencyTable getContingencyTable() { return contmat; } - - @Override - public void writeToText(TextWriterStream out, String label) { - out.commentPrint("Pair-F1, "); - out.commentPrint("Pair-Precision, "); - out.commentPrint("Pair-Recall, "); - out.commentPrint("Pair-Rand, "); - out.commentPrint("Pair-AdjustedRand, "); - out.commentPrint("Pair-FowlkesMallows, "); - out.commentPrint("Pair-Jaccard, "); - out.commentPrint("Pair-Mirkin, "); - out.commentPrint("Entropy-VI, "); - out.commentPrint("Entropy-NormalizedVI, "); - out.commentPrint("Entropy-F1, "); - out.commentPrint("Edit-F1, "); - out.commentPrint("SM-InvPurity, "); - out.commentPrint("SM-Purity, "); - out.commentPrint("SM-F1, "); - out.commentPrint("BCubed-Precision, "); - out.commentPrint("BCubed-Recall, "); - out.commentPrint("BCubed-F1"); - out.flush(); - out.inlinePrint(contmat.getPaircount().f1Measure()); - out.inlinePrint(contmat.getPaircount().precision()); - out.inlinePrint(contmat.getPaircount().recall()); - out.inlinePrint(contmat.getPaircount().randIndex()); - out.inlinePrint(contmat.getPaircount().adjustedRandIndex()); - out.inlinePrint(contmat.getPaircount().fowlkesMallows()); - out.inlinePrint(contmat.getPaircount().jaccard()); - out.inlinePrint(contmat.getPaircount().mirkin()); - out.inlinePrint(contmat.getEntropy().variationOfInformation()); - out.inlinePrint(contmat.getEntropy().normalizedVariationOfInformation()); - out.inlinePrint(contmat.getEdit().f1Measure()); - out.inlinePrint(contmat.getSetMatching().inversePurity()); - out.inlinePrint(contmat.getSetMatching().purity()); - out.inlinePrint(contmat.getSetMatching().f1Measure()); - out.inlinePrint(contmat.getBCubed().precision()); - out.inlinePrint(contmat.getBCubed().recall()); - out.inlinePrint(contmat.getBCubed().f1Measure()); - out.flush(); - } } /** diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java index ef7935d8..cbe8e618 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/PairCounting.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,6 +24,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; */ import de.lmu.ifi.dbs.elki.logging.LoggingUtil; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; /** @@ -54,16 +55,18 @@ public class PairCounting { long inBoth = 0, in1 = 0, in2 = 0, total = 0; // Process first clustering: { - for (int i1 = 0; i1 < table.size1; i1++) { + for(int i1 = 0; i1 < table.size1; i1++) { final int size = table.contingency[i1][table.size2 + 1]; - if (table.breakNoiseClusters && table.noise1.get(i1)) { - if (table.selfPairing) { + if(table.breakNoiseClusters && BitsUtil.get(table.noise1, i1)) { + if(table.selfPairing) { in1 += size; } // else: 0 - } else { - if (table.selfPairing) { + } + else { + if(table.selfPairing) { in1 += size * size; - } else { + } + else { in1 += size * (size - 1); } } @@ -71,33 +74,37 @@ public class PairCounting { } // Process second clustering: { - for (int i2 = 0; i2 < table.size2; i2++) { + for(int i2 = 0; i2 < table.size2; i2++) { final int size = table.contingency[table.size1 + 1][i2]; - if (table.breakNoiseClusters && table.noise2.get(i2)) { - if (table.selfPairing) { + if(table.breakNoiseClusters && BitsUtil.get(table.noise2, i2)) { + if(table.selfPairing) { in2 += size; } // else: 0 - } else { - if (table.selfPairing) { + } + else { + if(table.selfPairing) { in2 += size * size; - } else { + } + else { in2 += size * (size - 1); } } } } // Process combinations - for (int i1 = 0; i1 < table.size1; i1++) { - for (int i2 = 0; i2 < table.size2; i2++) { + for(int i1 = 0; i1 < table.size1; i1++) { + for(int i2 = 0; i2 < table.size2; i2++) { final int size = table.contingency[i1][i2]; - if (table.breakNoiseClusters && (table.noise1.get(i1) || table.noise2.get(i2))) { - if (table.selfPairing) { + if(table.breakNoiseClusters && (BitsUtil.get(table.noise1, i1) || BitsUtil.get(table.noise2, i2))) { + if(table.selfPairing) { inBoth += size; } // else: 0 - } else { - if (table.selfPairing) { + } + else { + if(table.selfPairing) { inBoth += size * size; - } else { + } + else { inBoth += size * (size - 1); } } @@ -105,15 +112,16 @@ public class PairCounting { } // The official sum int tsize = table.contingency[table.size1][table.size2]; - if (table.contingency[table.size1][table.size2 + 1] != tsize || table.contingency[table.size1 + 1][table.size2] != tsize) { + if(table.contingency[table.size1][table.size2 + 1] != tsize || table.contingency[table.size1 + 1][table.size2] != tsize) { LoggingUtil.warning("PairCounting F-Measure is not well defined for overlapping and incomplete clusterings. The number of elements are: " + table.contingency[table.size1][table.size2 + 1] + " != " + table.contingency[table.size1 + 1][table.size2] + " elements."); } - if (tsize < 0 || tsize >= MAX_SIZE) { + if(tsize < 0 || tsize >= MAX_SIZE) { LoggingUtil.warning("Your data set size probably is too big for this implementation, which uses only long precision."); } - if (table.selfPairing) { + if(table.selfPairing) { total = tsize * tsize; - } else { + } + else { total = tsize * (tsize - 1); } long inFirst = in1 - inBoth, inSecond = in2 - inBoth; @@ -172,7 +180,9 @@ public class PairCounting { * @return pair-counting Fowlkes-mallows */ // TODO: implement for non-flat clusterings! - @Reference(authors = "Fowlkes, E.B. and Mallows, C.L.", title = "A method for comparing two hierarchical clusterings", booktitle = "Journal of the American Statistical Association, Vol. 78 Issue 383") + @Reference(authors = "Fowlkes, E.B. and Mallows, C.L.", // + title = "A method for comparing two hierarchical clusterings", // + booktitle = "Journal of the American Statistical Association, Vol. 78 Issue 383") public double fowlkesMallows() { return Math.sqrt(precision() * recall()); } @@ -188,7 +198,10 @@ public class PairCounting { * * @return The Rand index (RI). */ - @Reference(authors = "Rand, W. M.", title = "Objective Criteria for the Evaluation of Clustering Methods", booktitle = "Journal of the American Statistical Association, Vol. 66 Issue 336", url = "http://www.jstor.org/stable/10.2307/2284239") + @Reference(authors = "Rand, W. M.", // + title = "Objective Criteria for the Evaluation of Clustering Methods", // + booktitle = "Journal of the American Statistical Association, Vol. 66 Issue 336", // + url = "http://www.jstor.org/stable/10.2307/2284239") public double randIndex() { final double sum = pairconfuse[0] + pairconfuse[1] + pairconfuse[2] + pairconfuse[3]; return (pairconfuse[0] + pairconfuse[3]) / sum; @@ -203,9 +216,10 @@ public class PairCounting { final double nom = pairconfuse[0] * pairconfuse[3] - pairconfuse[1] * pairconfuse[2]; final long d1 = (pairconfuse[0] + pairconfuse[1]) * (pairconfuse[1] + pairconfuse[3]); final long d2 = (pairconfuse[0] + pairconfuse[2]) * (pairconfuse[2] + pairconfuse[3]); - if (d1 + d2 > 0) { + if(d1 + d2 > 0) { return 2 * nom / (d1 + d2); - } else { + } + else { return 1.; } } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java index 1a003117..d37d4693 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/SetMatchingPurity.java @@ -1,9 +1,10 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering; + /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -41,19 +42,29 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; * University of Washington, Seattle, Technical Report 418, 2002 * </p> * <p> - * Steinbach, M. and Karypis, G. and Kumar, V. and others<br /> + * Steinbach, M. and Karypis, G. and Kumar, V.<br /> * A comparison of document clustering techniques<br /> * KDD workshop on text mining, 2000 * </p> + * <p> + * E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo <br /> + * A comparison of extrinsic clustering evaluation metrics based on formal + * constraints<br /> + * Inf. Retrieval, vol. 12, no. 5, pp. 461–486, 2009 + * </p> * * @author Sascha Goldhofer */ -@Reference(authors = "Meilă, M", title = "Comparing clusterings", booktitle = "University of Washington, Seattle, Technical Report 418, 2002", url = "http://www.stat.washington.edu/mmp/www.stat.washington.edu/mmp/Papers/compare-colt.pdf") +@Reference(authors = "Meilă, M", // +title = "Comparing clusterings", // +booktitle = "University of Washington, Seattle, Technical Report 418, 2002", // +url = "http://www.stat.washington.edu/mmp/Papers/compare-colt.pdf") public class SetMatchingPurity { /** * Result cache */ - protected double smPurity = -1.0, smInversePurity = -1.0, smFFirst = -1.0, smFSecond = - 1.0; + protected double smPurity = -1.0, smInversePurity = -1.0, smFFirst = -1.0, + smFSecond = -1.0; /** * Constructor. @@ -105,14 +116,17 @@ public class SetMatchingPurity { * * @return purity */ - @Reference(authors = "Zhao, Y. and Karypis, G.", title = "Criterion functions for document clustering: Experiments and analysis", booktitle = "University of Minnesota, Department of Computer Science, Technical Report 01-40, 2001", url = "http://www-users.cs.umn.edu/~karypis/publications/Papers/PDF/vscluster.pdf") + @Reference(authors = "Zhao, Y. and Karypis, G.", // + title = "Criterion functions for document clustering: Experiments and analysis", // + booktitle = "University of Minnesota, Department of Computer Science, Technical Report 01-40, 2001", // + url = "http://www-users.cs.umn.edu/~karypis/publications/Papers/PDF/vscluster.pdf") public double purity() { return smPurity; } /** - * Get the set matchings inverse purity (second:first clustering) - * (normalized, 1 = equal) + * Get the set matchings inverse purity (second:first clustering) (normalized, + * 1 = equal) * * @return Inverse purity */ @@ -124,44 +138,57 @@ public class SetMatchingPurity { * Get the set matching F1-Measure * * <p> - * Steinbach, M. and Karypis, G. and Kumar, V. and others<br /> + * Steinbach, M. and Karypis, G. and Kumar, V.<br /> * A comparison of document clustering techniques<br /> * KDD workshop on text mining, 2000 * </p> * * @return Set Matching F1-Measure */ - @Reference(authors = "Steinbach, M. and Karypis, G. and Kumar, V. and others", title = "A comparison of document clustering techniques", booktitle = "KDD workshop on text mining, 2000", url = "http://www-users.itlabs.umn.edu/~karypis/publications/Papers/PDF/doccluster.pdf") + @Reference(authors = "Steinbach, M. and Karypis, G. and Kumar, V.", // + title = "A comparison of document clustering techniques", // + booktitle = "KDD workshop on text mining, 2000", // + url = "http://www-users.itlabs.umn.edu/~karypis/publications/Papers/PDF/doccluster.pdf") public double f1Measure() { return Util.f1Measure(purity(), inversePurity()); } - + /** * Get the Van Rijsbergen’s F measure (asymmetric) for first clustering * * <p> * E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo <br /> - * A comparison of extrinsic clustering evaluation metrics based on formal constraints<br /> + * A comparison of extrinsic clustering evaluation metrics based on formal + * constraints<br /> * Inf. Retrieval, vol. 12, no. 5, pp. 461–486, 2009 * </p> * * @return Set Matching F-Measure of first clustering */ + @Reference(authors = "E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo", // + title = "A comparison of extrinsic clustering evaluation metrics based on formal constraints", // + booktitle = "Inf. Retrieval, vol. 12, no. 5", // + url = "http://dx.doi.org/10.1007/s10791-009-9106-z") public double fMeasureFirst() { return smFFirst; } - + /** * Get the Van Rijsbergen’s F measure (asymmetric) for second clustering * * <p> * E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo <br /> - * A comparison of extrinsic clustering evaluation metrics based on formal constraints<br /> + * A comparison of extrinsic clustering evaluation metrics based on formal + * constraints<br /> * Inf. Retrieval, vol. 12, no. 5, pp. 461–486, 2009 * </p> * * @return Set Matching F-Measure of second clustering */ + @Reference(authors = "E. Amigó, J. Gonzalo, J. Artiles, and F. Verdejo", // + title = "A comparison of extrinsic clustering evaluation metrics based on formal constraints", // + booktitle = "Inf. Retrieval, vol. 12, no. 5", // + url = "http://dx.doi.org/10.1007/s10791-009-9106-z") public double fMeasureSecond() { return smFSecond; } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java new file mode 100644 index 00000000..bc2817a6 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluateSilhouette.java @@ -0,0 +1,265 @@ +package de.lmu.ifi.dbs.elki.evaluation.clustering.internal; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.EvaluationResult; +import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Compute the silhouette of a data set. + * + * Reference: + * <p> + * P. J. Rousseeuw<br /> + * Silhouettes: A graphical aid to the interpretation and validation of cluster + * analysis<br /> + * In: Journal of Computational and Applied Mathematics Volume 20, November 1987 + * </p> + * + * TODO: keep all silhouette values, and allow visualization! + * + * @author Erich Schubert + * + * @param <O> Object type + */ +@Reference(authors = "P. J. Rousseeuw", // +title = "Silhouettes: A graphical aid to the interpretation and validation of cluster analysis", // +booktitle = "Journal of Computational and Applied Mathematics, Volume 20", // +url = "http://dx.doi.org/10.1016%2F0377-0427%2887%2990125-7") +public class EvaluateSilhouette<O> implements Evaluator { + /** + * Logger for debug output. + */ + private static final Logging LOG = Logging.getLogger(EvaluateSilhouette.class); + + /** + * Keep noise "clusters" merged, instead of breaking them into singletons. + */ + private boolean mergenoise = false; + + /** + * Distance function to use. + */ + private DistanceFunction<? super O> distance; + + /** + * Key for logging statistics. + */ + private String key = EvaluateSilhouette.class.getName(); + + /** + * Constructor. + * + * @param distance Distance function + * @param mergenoise Flag to treat noise as clusters, instead of breaking them + * into singletons. + */ + public EvaluateSilhouette(DistanceFunction<? super O> distance, boolean mergenoise) { + super(); + this.distance = distance; + this.mergenoise = mergenoise; + } + + /** + * Evaluate a single clustering. + * + * @param db Database + * @param rel Data relation + * @param dq Distance query + * @param c Clustering + */ + public void evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) { + List<? extends Cluster<?>> clusters = c.getAllClusters(); + MeanVariance msil = new MeanVariance(); + for(Cluster<?> cluster : clusters) { + if(cluster.size() <= 1 || (!mergenoise && cluster.isNoise())) { + // As suggested in Rousseeuw, we use 0 for singletons. + msil.put(0., cluster.size()); + continue; + } + ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs()); + double[] as = new double[ids.size()]; // temporary storage. + DBIDArrayIter it1 = ids.iter(), it2 = ids.iter(); + for(it1.seek(0); it1.valid(); it1.advance()) { + // a: In-cluster distances + double a = as[it1.getOffset()]; // Already computed distances + for(it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) { + final double dist = dq.distance(it1, it2); + a += dist; + as[it2.getOffset()] += dist; + } + a /= (ids.size() - 1); + // b: other clusters: + double min = Double.POSITIVE_INFINITY; + for(Cluster<?> ocluster : clusters) { + if(ocluster == /* yes, reference identity */cluster) { + continue; + } + if(!mergenoise && ocluster.isNoise()) { + // Treat noise cluster as singletons: + for(DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) { + double dist = dq.distance(it1, it3); + if(dist < min) { + min = dist; + } + } + continue; + } + final DBIDs oids = ocluster.getIDs(); + double b = 0.; + for(DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) { + b += dq.distance(it1, it3); + } + b /= oids.size(); + if(b < min) { + min = b; + } + } + msil.put((min - a) / Math.max(min, a)); + } + } + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(key + ".silhouette.noise", mergenoise ? 1L : 0L)); + LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", msil.getMean())); + LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", msil.getSampleStddev())); + } + + EvaluationResult ev = new EvaluationResult("Internal Clustering Evaluation", "internal evaluation"); + MeasurementGroup g = ev.newGroup("Distance-based Evaluation"); + g.addMeasure("Silhouette coefficient +-" + FormatUtil.NF2.format(msil.getSampleStddev()), msil.getMean(), -1., 1., 0., false); + db.getHierarchy().add(c, ev); + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + List<Clustering<?>> crs = ResultUtil.getClusteringResults(result); + if(crs.size() < 1) { + return; + } + Database db = ResultUtil.findDatabase(baseResult); + Relation<O> rel = db.getRelation(distance.getInputTypeRestriction()); + DistanceQuery<O> dq = db.getDistanceQuery(rel, distance); + for(Clustering<?> c : crs) { + evaluateClustering(db, rel, dq, c); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O> extends AbstractParameterizer { + /** + * Parameter for choosing the distance function. + */ + public static final OptionID DISTANCE_ID = new OptionID("silhouette.distance", "Distance function to use for computing the silhouette."); + + /** + * Parameter to treat noise as a single cluster. + */ + public static final OptionID MERGENOISE_ID = new OptionID("silhouette.noisecluster", "Treat noise as a cluster, not as singletons."); + + /** + * Distance function to use. + */ + private DistanceFunction<? super O> distance; + + /** + * Keep noise "clusters" merged. + */ + private boolean mergenoise = false; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<DistanceFunction<? super O>> distP = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + if(config.grab(distP)) { + distance = distP.instantiateClass(config); + } + + Flag noiseP = new Flag(MERGENOISE_ID); + if(config.grab(noiseP)) { + mergenoise = noiseP.isTrue(); + } + } + + @Override + protected EvaluateSilhouette<O> makeInstance() { + return new EvaluateSilhouette<>(distance, mergenoise); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/package-info.java new file mode 100644 index 00000000..952794c8 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/package-info.java @@ -0,0 +1,26 @@ +/** + * Internal evaluation measures for clusterings. + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.evaluation.clustering.internal;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java index 7dc244fd..666de5df 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java index 76508563..dcd972a7 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java index 76769d78..a9e2fbc9 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java index b7215579..ced8d726 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java index 01d0d09a..01d3bfa0 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java b/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java index a9c24172..be06ade2 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/histogram/ComputeOutlierHistogram.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.histogram; This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team @@ -226,12 +226,12 @@ public class ComputeOutlierHistogram implements Evaluator { ids.removeDBIDs(outlierIds);
// fill histogram with values of each object
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- double result = or.getScores().get(iter);
+ double result = or.getScores().doubleValue(iter);
result = scaling.getScaled(result);
hist.putData(result, negative);
}
for(DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
- double result = or.getScores().get(iter);
+ double result = or.getScores().doubleValue(iter);
result = scaling.getScaled(result);
hist.putData(result, positive);
}
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/histogram/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/histogram/package-info.java index a07bcc30..6daba3a0 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/histogram/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/histogram/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexPurity.java b/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexPurity.java index ac0f3457..7b85387e 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexPurity.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexPurity.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.index; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexStatistics.java b/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexStatistics.java index f7c9caa8..1ad74141 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexStatistics.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/index/IndexStatistics.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.index; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/index/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/index/package-info.java index 67fdaa98..2fdf5caf 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/index/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/index/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java index 1639515a..ff34f7c0 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/JudgeOutlierScores.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.outlier; This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team @@ -147,12 +147,12 @@ public class JudgeOutlierScores implements Evaluator { double negscore = 0.0;
// fill histogram with values of each object
for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- double result = or.getScores().get(iter);
+ double result = or.getScores().doubleValue(iter);
result = innerScaling.getScaled(scaling.getScaled(result));
posscore += (1.0 - result);
}
for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
- double result = or.getScores().get(iter);
+ double result = or.getScores().doubleValue(iter);
result = innerScaling.getScaled(scaling.getScaled(result));
negscore += result;
}
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java index 72967d58..f876469c 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionAtKCurve.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java index a3d9d92f..8ac9a559 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierPrecisionRecallCurve.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; @@ -116,7 +116,7 @@ public class OutlierPrecisionRecallCurve implements Evaluator { } } - private XYCurve computePrecisionResult(int size, SetDBIDs ids, DBIDIter iter, Relation<Double> scores) { + private XYCurve computePrecisionResult(int size, SetDBIDs ids, DBIDIter iter, DoubleRelation scores) { final int postot = ids.size(); int poscnt = 0, total = 0; XYCurve curve = new PRCurve(postot + 2, postot); @@ -140,7 +140,7 @@ public class OutlierPrecisionRecallCurve implements Evaluator { } // defer calculation for ties if (scores != null) { - double curscore = scores.get(iter); + double curscore = scores.doubleValue(iter); if (Double.compare(prevscore, curscore) == 0) { continue; } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java index 0d92c50a..fc7250c1 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierROCCurve.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,7 +31,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; -import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; +import de.lmu.ifi.dbs.elki.evaluation.scores.ROCEvaluation; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DBIDsTest; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.OutlierScoreAdapter; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.SimpleAdapter; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; @@ -103,30 +106,26 @@ public class OutlierROCCurve implements Evaluator { } private ROCResult computeROCResult(int size, SetDBIDs positiveids, DBIDs order) { - if (order.size() != size) { + if(order.size() != size) { throw new IllegalStateException("Iterable result doesn't match database size - incomplete ordering?"); } - XYCurve roccurve = ROC.materializeROC(new ROC.DBIDsTest(positiveids), new ROC.SimpleAdapter(order.iter())); + XYCurve roccurve = ROCEvaluation.materializeROC(new DBIDsTest(positiveids), new SimpleAdapter(order.iter())); double rocauc = XYCurve.areaUnderCurve(roccurve); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose(ROCAUC_LABEL + ": " + rocauc); } - final ROCResult rocresult = new ROCResult(roccurve, rocauc); - - return rocresult; + return new ROCResult(roccurve, rocauc); } private ROCResult computeROCResult(int size, SetDBIDs positiveids, OutlierResult or) { - XYCurve roccurve = ROC.materializeROC(new ROC.DBIDsTest(positiveids), new ROC.OutlierScoreAdapter(or)); + XYCurve roccurve = ROCEvaluation.materializeROC(new DBIDsTest(positiveids), new OutlierScoreAdapter(or)); double rocauc = XYCurve.areaUnderCurve(roccurve); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose(ROCAUC_LABEL + ": " + rocauc); } - final ROCResult rocresult = new ROCResult(roccurve, rocauc); - - return rocresult; + return new ROCResult(roccurve, rocauc); } @Override @@ -135,7 +134,7 @@ public class OutlierROCCurve implements Evaluator { // Prepare SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); - if (positiveids.size() == 0) { + if(positiveids.size() == 0) { LOG.warning("Computing a ROC curve failed - no objects matched."); return; } @@ -144,7 +143,7 @@ public class OutlierROCCurve implements Evaluator { List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); // Outlier results are the main use case. - for (OutlierResult o : oresults) { + for(OutlierResult o : oresults) { db.getHierarchy().add(o, computeROCResult(o.getScores().size(), positiveids, o)); // Process them only once. orderings.remove(o.getOrdering()); @@ -153,13 +152,13 @@ public class OutlierROCCurve implements Evaluator { // FIXME: find appropriate place to add the derived result // otherwise apply an ordering to the database IDs. - for (OrderingResult or : orderings) { + for(OrderingResult or : orderings) { DBIDs sorted = or.iter(or.getDBIDs()); db.getHierarchy().add(or, computeROCResult(or.getDBIDs().size(), positiveids, sorted)); nonefound = false; } - if (nonefound) { + if(nonefound) { return; // logger.warning("No results found to process with ROC curve analyzer. Got "+iterables.size()+" iterables, "+orderings.size()+" orderings."); } @@ -229,7 +228,7 @@ public class OutlierROCCurve implements Evaluator { protected void makeOptions(Parameterization config) { super.makeOptions(config); PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); - if (config.grab(positiveClassNameP)) { + if(config.grab(positiveClassNameP)) { positiveClassName = positiveClassNameP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierRankingEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierRankingEvaluation.java new file mode 100644 index 00000000..f78a9790 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierRankingEvaluation.java @@ -0,0 +1,211 @@ +package de.lmu.ifi.dbs.elki.evaluation.outlier; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.evaluation.scores.AveragePrecisionEvaluation; +import de.lmu.ifi.dbs.elki.evaluation.scores.MaximumF1Evaluation; +import de.lmu.ifi.dbs.elki.evaluation.scores.PrecisionAtKEvaluation; +import de.lmu.ifi.dbs.elki.evaluation.scores.ROCEvaluation; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DBIDsTest; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.OutlierScoreAdapter; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.SimpleAdapter; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.result.EvaluationResult; +import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.result.OrderingResult; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Evaluate outlier scores by their ranking + * + * @author Erich Schubert + * + * @apiviz.landmark + * + * @apiviz.uses OutlierResult + * @apiviz.has EvaluationResult oneway - - «create» + */ +public class OutlierRankingEvaluation implements Evaluator { + /** + * The logger. + */ + private static final Logging LOG = Logging.getLogger(OutlierRankingEvaluation.class); + + /** + * The pattern to identify positive classes. + * + * <p> + * Key: {@code -rocauc.positive} + * </p> + */ + public static final OptionID POSITIVE_CLASS_NAME_ID = new OptionID("outliereval.positive", "Class label for the 'positive' class."); + + /** + * Stores the "positive" class. + */ + private Pattern positiveClassName; + + /** + * Key prefix for statistics logging. + */ + private String key = OutlierRankingEvaluation.class.getName(); + + /** + * Constructor. + * + * @param positive_class_name Positive class name pattern + */ + public OutlierRankingEvaluation(Pattern positive_class_name) { + super(); + this.positiveClassName = positive_class_name; + } + + private EvaluationResult evaluateOutlierResult(int size, SetDBIDs positiveids, OutlierResult or) { + EvaluationResult res = new EvaluationResult("Evaluation of ranking", "ranking-evaluation"); + DBIDsTest test = new DBIDsTest(positiveids); + + MeasurementGroup g = res.newGroup("Evaluation measures"); + double rocauc = ROCEvaluation.STATIC.evaluate(test, new OutlierScoreAdapter(or)); + g.addMeasure("ROC AUC", rocauc, 0., 1. ,.5, false); + double avep = AveragePrecisionEvaluation.STATIC.evaluate(test, new OutlierScoreAdapter(or)); + g.addMeasure("Average Precision", avep, 0., 1., 0., false); + double rprec = PrecisionAtKEvaluation.RPRECISION.evaluate(test, new OutlierScoreAdapter(or)); + g.addMeasure("R-Precision", rprec, 0., 1., 0., false); + double maxf1 = MaximumF1Evaluation.STATIC.evaluate(test, new OutlierScoreAdapter(or)); + g.addMeasure("Maximum F1", maxf1, 0., 1., 0., false); + if(LOG.isStatistics()) { + LOG.statistics(new DoubleStatistic(key + ".rocauc", rocauc)); + LOG.statistics(new DoubleStatistic(key + ".precision.average", rocauc)); + LOG.statistics(new DoubleStatistic(key + ".precision.r", rocauc)); + LOG.statistics(new DoubleStatistic(key + ".f1.maximum", rocauc)); + } + return res; + } + + private EvaluationResult evaluateOrderingResult(int size, SetDBIDs positiveids, DBIDs order) { + if(order.size() != size) { + throw new IllegalStateException("Iterable result doesn't match database size - incomplete ordering?"); + } + + EvaluationResult res = new EvaluationResult("Evaluation of ranking", "ranking-evaluation"); + DBIDsTest test = new DBIDsTest(positiveids); + + MeasurementGroup g = res.newGroup("Evaluation measures"); + double rocauc = ROCEvaluation.STATIC.evaluate(test, new SimpleAdapter(order.iter())); + g.addMeasure("ROC AUC", rocauc, 0., 1. ,.5, false); + double avep = AveragePrecisionEvaluation.STATIC.evaluate(test, new SimpleAdapter(order.iter())); + g.addMeasure("Average Precision", avep, 0., 1., 0., false); + double rprec = PrecisionAtKEvaluation.RPRECISION.evaluate(test, new SimpleAdapter(order.iter())); + g.addMeasure("R-Precision", rprec, 0., 1., 0., false); + double maxf1 = MaximumF1Evaluation.STATIC.evaluate(test, new SimpleAdapter(order.iter())); + g.addMeasure("Maximum F1", maxf1, 0., 1., 0., false); + if(LOG.isStatistics()) { + LOG.statistics(new DoubleStatistic(key + ".rocauc", rocauc)); + LOG.statistics(new DoubleStatistic(key + ".precision.average", rocauc)); + LOG.statistics(new DoubleStatistic(key + ".precision.r", rocauc)); + LOG.statistics(new DoubleStatistic(key + ".f1.maximum", rocauc)); + } + return res; + } + + @Override + public void processNewResult(HierarchicalResult baseResult, Result result) { + Database db = ResultUtil.findDatabase(baseResult); + SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName)); + + if(positiveids.size() == 0) { + LOG.warning("Cannot evaluate outlier results - no objects matched the given pattern."); + return; + } + + boolean nonefound = true; + List<OutlierResult> oresults = ResultUtil.getOutlierResults(result); + List<OrderingResult> orderings = ResultUtil.getOrderingResults(result); + // Outlier results are the main use case. + for(OutlierResult o : oresults) { + db.getHierarchy().add(o, evaluateOutlierResult(o.getScores().size(), positiveids, o)); + // Process them only once. + orderings.remove(o.getOrdering()); + nonefound = false; + } + + // FIXME: find appropriate place to add the derived result + // otherwise apply an ordering to the database IDs. + for(OrderingResult or : orderings) { + DBIDs sorted = or.iter(or.getDBIDs()); + db.getHierarchy().add(or, evaluateOrderingResult(or.getDBIDs().size(), positiveids, sorted)); + nonefound = false; + } + + if(nonefound) { + return; + // logger.warning("No results found to process with ROC curve analyzer. Got "+iterables.size()+" iterables, "+orderings.size()+" orderings."); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Pattern for positive class. + */ + protected Pattern positiveClassName = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); + if(config.grab(positiveClassNameP)) { + positiveClassName = positiveClassNameP.getValue(); + } + } + + @Override + protected OutlierRankingEvaluation makeInstance() { + return new OutlierRankingEvaluation(positiveClassName); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java index 71ec42f5..3400a4bf 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierSmROCCurve.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; @@ -60,20 +60,21 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; * However, this method has some deficiencies when the mean score is not 0.5, as * discussed in: * <p> - * E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br> - * On Evaluation of Outlier Rankings and Outlier Scores<br> + * E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br /> + * On Evaluation of Outlier Rankings and Outlier Scores<br /> * In Proceedings of the 12th SIAM International Conference on Data Mining * (SDM), Anaheim, CA, 2012. * </p> * * @author Erich Schubert * - * @apiviz.landmark - * * @apiviz.uses OutlierResult * @apiviz.has SmROCResult oneway - - «create» */ -@Reference(authors = "W. Klement, P. A. Flach, N. Japkowicz, S. Matwin", title = "Smooth Receiver Operating Characteristics (smROC) Curves", booktitle = "In: European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD'11)", url = "http://dx.doi.org/10.1007/978-3-642-23783-6_13") +@Reference(authors = "W. Klement, P. A. Flach, N. Japkowicz, S. Matwin", // +title = "Smooth Receiver Operating Characteristics (smROC) Curves", // +booktitle = "In: European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD'11)", // +url = "http://dx.doi.org/10.1007/978-3-642-23783-6_13") public class OutlierSmROCCurve implements Evaluator { /** * The label we use for marking ROCAUC values. @@ -101,13 +102,13 @@ public class OutlierSmROCCurve implements Evaluator { } private SmROCResult computeSmROCResult(SetDBIDs positiveids, OutlierResult or) { - Relation<Double> scores = or.getScores(); + DoubleRelation scores = or.getScores(); final int size = scores.size(); // Compute mean, for inversion double mean = 0.0; for(DBIDIter iditer = scores.iterDBIDs(); iditer.valid(); iditer.advance()) { - mean += scores.get(iditer) / size; + mean += scores.doubleValue(iditer) / size; } SmROCResult curve = new SmROCResult(positiveids.size() + 2); @@ -118,9 +119,9 @@ public class OutlierSmROCCurve implements Evaluator { int poscnt = 0, negcnt = 0; double prevscore = Double.NaN; double x = 0, y = 0; - for (DBIDIter nei = or.getOrdering().iter(or.getOrdering().getDBIDs()).iter(); nei.valid(); nei.advance()) { + for(DBIDIter nei = or.getOrdering().iter(or.getOrdering().getDBIDs()).iter(); nei.valid(); nei.advance()) { // Analyze next point - final double curscore = scores.get(nei); + final double curscore = scores.doubleValue(nei); // defer calculation for ties if(!Double.isNaN(prevscore) && (Double.compare(prevscore, curscore) == 0)) { // positive or negative match? diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java index 04233378..5591df3f 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/OutlierThresholdClustering.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.evaluation.outlier; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; import de.lmu.ifi.dbs.elki.result.Result; @@ -88,7 +88,7 @@ public class OutlierThresholdClustering implements Evaluator { } private Clustering<Model> split(OutlierResult or) { - Relation<Double> scores = or.getScores(); + DoubleRelation scores = or.getScores(); if(scaling instanceof OutlierScalingFunction) { ((OutlierScalingFunction) scaling).prepare(or); } @@ -97,7 +97,7 @@ public class OutlierThresholdClustering implements Evaluator { idlists.add(DBIDUtil.newHashSet()); } for(DBIDIter iter = scores.getDBIDs().iter(); iter.valid(); iter.advance()) { - double score = scores.get(iter); + double score = scores.doubleValue(iter); if(scaling != null) { score = scaling.getScaled(score); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/package-info.java index 1e5e6933..81cf2248 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/outlier/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/outlier/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/package-info.java index 8b8a96dc..54755c75 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java b/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java deleted file mode 100644 index 0ca42b29..00000000 --- a/src/de/lmu/ifi/dbs/elki/evaluation/roc/ROC.java +++ /dev/null @@ -1,656 +0,0 @@ -package de.lmu.ifi.dbs.elki.evaluation.roc; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2013 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -import de.lmu.ifi.dbs.elki.data.Cluster; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; -import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; -import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; -import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.ArrayIter; -import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.Iter; - -/** - * Compute ROC (Receiver Operating Characteristics) curves. - * - * A ROC curve compares the true positive rate (y-axis) and false positive rate - * (x-axis). - * - * It was first used in radio signal detection, but has since found widespread - * use in information retrieval, in particular for evaluating binary - * classification problems. - * - * ROC curves are particularly useful to evaluate a ranking of objects with - * respect to a binary classification problem: a random sampling will - * approximately achieve a ROC value of 0.5, while a perfect separation will - * achieve 1.0 (all positives first) or 0.0 (all negatives first). In most use - * cases, a score significantly below 0.5 indicates that the algorithm result - * has been used the wrong way, and should be used backwards. - * - * @author Erich Schubert - * - * @apiviz.composedOf ScoreIter - * @apiviz.composedOf Predicate - * @apiviz.has XYCurve - */ -public class ROC { - /** - * Iterator for comparing scores. - * - * @author Erich Schubert - */ - public static interface ScoreIter extends Iter { - /** - * Test whether the score is the same as the previous objects score. - * - * When there is no previous result, implementations should return false! - * - * @return Boolean - */ - boolean tiedToPrevious(); - } - - /** - * Predicate to test whether an object is a true positive or false positive. - * - * @author Erich Schubert - * - * @param <T> Data type - */ - public static interface Predicate<T> { - boolean test(T o); - } - - /** - * Compute a ROC curve given a set of positive IDs and a sorted list of - * (comparable, ID)s, where the comparable object is used to decided when two - * objects are interchangeable. - * - * @param <I> Iterator type - * @param predicate Predicate to test for positive objects - * @param iter Iterator over results, with ties. - * @return area under curve - */ - public static <I extends ScoreIter> XYCurve materializeROC(Predicate<? super I> predicate, I iter) { - int poscnt = 0, negcnt = 0; - XYCurve curve = new XYCurve("False Positive Rate", "True Positive Rate"); - - // start in bottom left - curve.add(0.0, 0.0); - - while (iter.valid()) { - // positive or negative match? - do { - if (predicate.test(iter)) { - ++poscnt; - } else { - ++negcnt; - } - iter.advance(); - } // Loop while tied: - while (iter.valid() && iter.tiedToPrevious()); - // Add a new point. - curve.addAndSimplify(negcnt, poscnt); - } - // Ensure we end up in the top right corner. - // Simplification will skip this if we already were. - curve.addAndSimplify(negcnt, poscnt); - curve.rescale(1. / negcnt, 1. / poscnt); - return curve; - } - - /** - * This adapter can be used for an arbitrary collection of Integers, and uses - * that id1.compareTo(id2) != 0 for id1 != id2 to satisfy the comparability. - * - * Note that of course, no id should occur more than once. - * - * The ROC values would be incorrect then anyway! - * - * @author Erich Schubert - * - * @apiviz.composedOf DBIDIter - */ - public static class SimpleAdapter implements ScoreIter, DBIDRef { - /** - * Original Iterator - */ - private DBIDIter iter; - - /** - * Constructor - * - * @param iter Iterator for object IDs - */ - public SimpleAdapter(DBIDIter iter) { - super(); - this.iter = iter; - } - - @Override - public boolean valid() { - return iter.valid(); - } - - @Override - public void advance() { - iter.advance(); - } - - @Override - public boolean tiedToPrevious() { - return false; // No information. - } - - @Override - public int internalGetIndex() { - return iter.internalGetIndex(); - } - - @Deprecated - @Override - public int hashCode() { - return super.hashCode(); - } - - @Deprecated - @Override - public boolean equals(Object obj) { - return super.equals(obj); - } - } - - /** - * This adapter can be used for an arbitrary collection of Integers, and uses - * that id1.compareTo(id2) != 0 for id1 != id2 to satisfy the comparability. - * - * Note that of course, no id should occur more than once. - * - * The ROC values would be incorrect then anyway! - * - * @author Erich Schubert - * - * @apiviz.composedOf DistanceDBIDListIter - * - * @param <D> Distance type - */ - public static class DistanceResultAdapter<D extends Distance<D>> implements ScoreIter, DBIDRef { - /** - * Original Iterator - */ - private DistanceDBIDListIter<D> iter; - - /** - * Distance of previous. - */ - private D prevDist = null; - - /** - * Constructor - * - * @param iter Iterator for distance results - */ - public DistanceResultAdapter(DistanceDBIDListIter<D> iter) { - super(); - this.iter = iter; - } - - @Override - public boolean valid() { - return iter.valid(); - } - - @Override - public void advance() { - prevDist = iter.getDistance(); - iter.advance(); - } - - @Override - public int internalGetIndex() { - return iter.internalGetIndex(); - } - - @Override - public boolean tiedToPrevious() { - return iter.getDistance().equals(prevDist); - } - - @Deprecated - @Override - public int hashCode() { - return super.hashCode(); - } - - @Deprecated - @Override - public boolean equals(Object obj) { - return super.equals(obj); - } - } - - /** - * This adapter can be used for an arbitrary collection of Integers, and uses - * that id1.compareTo(id2) != 0 for id1 != id2 to satisfy the comparability. - * - * Note that of course, no id should occur more than once. - * - * The ROC values would be incorrect then anyway! - * - * @author Erich Schubert - * - * @apiviz.composedOf OutlierResult - */ - public static class OutlierScoreAdapter implements ScoreIter, DBIDRef { - /** - * Original iterator. - */ - private DBIDIter iter; - - /** - * Outlier score. - */ - private Relation<Double> scores; - - /** - * Previous value. - */ - double prev = Double.NaN; - - /** - * Constructor. - * - * @param o Result - */ - public OutlierScoreAdapter(OutlierResult o) { - super(); - this.iter = o.getOrdering().iter(o.getScores().getDBIDs()).iter(); - this.scores = o.getScores(); - } - - @Override - public boolean valid() { - return iter.valid(); - } - - @Override - public void advance() { - prev = scores.get(iter); - iter.advance(); - } - - @Override - public boolean tiedToPrevious() { - return scores.get(iter) == prev; - } - - @Override - public int internalGetIndex() { - return iter.internalGetIndex(); - } - - @Deprecated - @Override - public int hashCode() { - return super.hashCode(); - } - - @Deprecated - @Override - public boolean equals(Object obj) { - return super.equals(obj); - } - } - - /** - * Class to iterate over a number vector in decreasing order. - * - * @author Erich Schubert - * - * @apiviz.composedOf NumberVector - */ - public static class DecreasingVectorIter implements ScoreIter, IntegerComparator, ArrayIter { - /** - * Order of dimensions. - */ - private int[] sort; - - /** - * Data vector. - */ - private NumberVector<?> vec; - - /** - * Current position. - */ - int pos = 0; - - /** - * Constructor. - * - * @param vec Vector to iterate over. - */ - public DecreasingVectorIter(NumberVector<?> vec) { - this.vec = vec; - final int dim = vec.getDimensionality(); - this.sort = new int[dim]; - for (int d = 0; d < dim; d++) { - sort[d] = d; - } - IntegerArrayQuickSort.sort(sort, this); - } - - @Override - public int compare(int x, int y) { - return Double.compare(vec.doubleValue(y), vec.doubleValue(x)); - } - - public int dim() { - return sort[pos]; - } - - @Override - public boolean valid() { - return pos < vec.getDimensionality(); - } - - @Override - public void advance() { - ++pos; - } - - @Override - public boolean tiedToPrevious() { - return pos > 0 && Double.compare(vec.doubleValue(sort[pos]), vec.doubleValue(sort[pos - 1])) == 0; - } - - @Override - public int getOffset() { - return pos; - } - - @Override - public void advance(int count) { - pos += count; - } - - @Override - public void retract() { - pos--; - } - - @Override - public void seek(int off) { - pos = off; - } - } - - /** - * Class to iterate over a number vector in decreasing order. - * - * @author Erich Schubert - * - * @apiviz.composedOf NumberVector - */ - public static class IncreasingVectorIter implements ScoreIter, IntegerComparator, ArrayIter { - /** - * Order of dimensions. - */ - private int[] sort; - - /** - * Data vector. - */ - private NumberVector<?> vec; - - /** - * Current position. - */ - int pos = 0; - - /** - * Constructor. - * - * @param vec Vector to iterate over. - */ - public IncreasingVectorIter(NumberVector<?> vec) { - this.vec = vec; - final int dim = vec.getDimensionality(); - this.sort = new int[dim]; - for (int d = 0; d < dim; d++) { - sort[d] = d; - } - IntegerArrayQuickSort.sort(sort, this); - } - - @Override - public int compare(int x, int y) { - return Double.compare(vec.doubleValue(x), vec.doubleValue(y)); - } - - public int dim() { - return sort[pos]; - } - - @Override - public boolean valid() { - return pos < vec.getDimensionality(); - } - - @Override - public void advance() { - ++pos; - } - - @Override - public boolean tiedToPrevious() { - return pos > 0 && Double.compare(vec.doubleValue(sort[pos]), vec.doubleValue(sort[pos - 1])) == 0; - } - - @Override - public int getOffset() { - return pos; - } - - @Override - public void advance(int count) { - pos += count; - } - - @Override - public void retract() { - pos--; - } - - @Override - public void seek(int off) { - pos = off; - } - } - - /** - * Class that uses a NumberVector as reference, and considers all non-zero - * values as positive entries. - * - * @apiviz.composedOf NumberVector - * - * @author Erich Schubert - */ - public static class VectorNonZero extends VectorOverThreshold { - /** - * Constructor. - * - * @param vec Reference vector. - */ - public VectorNonZero(NumberVector<?> vec) { - super(vec, 0.); - } - } - - /** - * Class that uses a NumberVector as reference, and considers all non-zero - * values as positive entries. - * - * @apiviz.composedOf NumberVector - * - * @author Erich Schubert - */ - public static class VectorOverThreshold implements Predicate<DecreasingVectorIter> { - /** - * Vector to use as reference - */ - NumberVector<?> vec; - - /** - * Threshold - */ - double threshold; - - /** - * Constructor. - * - * @param vec Reference vector. - * @param threshold Threshold value. - */ - public VectorOverThreshold(NumberVector<?> vec, double threshold) { - super(); - this.vec = vec; - this.threshold = threshold; - } - - @Override - public boolean test(DecreasingVectorIter o) { - return Math.abs(vec.doubleValue(o.dim())) > threshold; - } - } - - /** - * Class that uses a NumberVector as reference, and considers all zero values - * as positive entries. - * - * @apiviz.composedOf NumberVector - * - * @author Erich Schubert - */ - public static class VectorZero implements Predicate<IncreasingVectorIter> { - /** - * Vector to use as reference - */ - NumberVector<?> vec; - - /** - * Constructor. - * - * @param vec Reference vector. - */ - public VectorZero(NumberVector<?> vec) { - this.vec = vec; - } - - @Override - public boolean test(IncreasingVectorIter o) { - return Math.abs(vec.doubleValue(o.dim())) < Double.MIN_NORMAL; - } - } - - /** - * Test predicate using a DBID set as positive elements. - * - * @apiviz.composedOf DBIDs - * - * @author Erich Schubert - */ - public static class DBIDsTest implements Predicate<DBIDRef> { - /** - * DBID set. - */ - private DBIDs set; - - /** - * Constructor. - * - * @param set Set of positive objects - */ - public DBIDsTest(DBIDs set) { - this.set = set; - } - - @Override - public boolean test(DBIDRef o) { - return set.contains(o); - } - } - - /** - * Compute a ROC curves Area-under-curve for a QueryResult and a Cluster. - * - * @param <D> Distance type - * @param size Database size - * @param clus Cluster object - * @param nei Query result - * @return area under curve - */ - public static <D extends Distance<D>> double computeROCAUCDistanceResult(int size, Cluster<?> clus, DistanceDBIDList<D> nei) { - // TODO: ensure the collection has efficient "contains". - return ROC.computeROCAUCDistanceResult(size, clus.getIDs(), nei); - } - - /** - * Compute a ROC curves Area-under-curve for a QueryResult and a Cluster. - * - * @param <D> Distance type - * @param size Database size - * @param ids Collection of positive IDs, should support efficient contains() - * @param nei Query Result - * @return area under curve - */ - public static <D extends Distance<D>> double computeROCAUCDistanceResult(int size, DBIDs ids, DistanceDBIDList<D> nei) { - // TODO: do not materialize the ROC, but introduce an iterator interface - XYCurve roc = materializeROC(new DBIDsTest(DBIDUtil.ensureSet(ids)), new DistanceResultAdapter<>(nei.iter())); - return XYCurve.areaUnderCurve(roc); - } - - /** - * Compute a ROC curves Area-under-curve for a QueryResult and a Cluster. - * - * @param size Database size - * @param ids Collection of positive IDs, should support efficient contains() - * @param nei Query Result - * @return area under curve - */ - public static double computeROCAUCSimple(int size, DBIDs ids, DBIDs nei) { - // TODO: do not materialize the ROC, but introduce an iterator interface - XYCurve roc = materializeROC(new DBIDsTest(DBIDUtil.ensureSet(ids)), new SimpleAdapter(nei.iter())); - return XYCurve.areaUnderCurve(roc); - } -} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/roc/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/roc/package-info.java deleted file mode 100644 index c6934ba2..00000000 --- a/src/de/lmu/ifi/dbs/elki/evaluation/roc/package-info.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * <p>Evaluation of rankings using ROC AUC (Receiver Operation Characteristics - Area Under Curve)</p> - * - * @apiviz.exclude java.util.* - */ -/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2013 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ -package de.lmu.ifi.dbs.elki.evaluation.roc;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/AbstractScoreEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/AbstractScoreEvaluation.java new file mode 100644 index 00000000..43de7ebd --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/AbstractScoreEvaluation.java @@ -0,0 +1,55 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DBIDsTest; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DistanceResultAdapter; +import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.OutlierScoreAdapter; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; + +/** + * Abstract base class for evaluating a scoring result. + * + * @author Erich Schubert + */ +public abstract class AbstractScoreEvaluation implements ScoreEvaluation { + @Override + public double evaluate(Cluster<?> clus, DoubleDBIDList nei) { + return evaluate(new DBIDsTest(DBIDUtil.ensureSet(clus.getIDs())), new DistanceResultAdapter(nei.iter())); + } + + @Override + public double evaluate(DBIDs ids, DoubleDBIDList nei) { + return evaluate(new DBIDsTest(DBIDUtil.ensureSet(ids)), new DistanceResultAdapter(nei.iter())); + } + + @Override + public double evaluate(DBIDs ids, OutlierResult outlier) { + return evaluate(new DBIDsTest(DBIDUtil.ensureSet(ids)), new OutlierScoreAdapter(outlier)); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/AveragePrecisionEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/AveragePrecisionEvaluation.java new file mode 100644 index 00000000..bc3d1522 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/AveragePrecisionEvaluation.java @@ -0,0 +1,76 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Evaluate using average precision. + * + * @author Erich Schubert + */ +public class AveragePrecisionEvaluation extends AbstractScoreEvaluation { + /** + * Static instance + */ + public static final AveragePrecisionEvaluation STATIC = new AveragePrecisionEvaluation(); + + @Override + public <I extends ScoreIter> double evaluate(Predicate<? super I> predicate, I iter) { + int poscnt = 0, negcnt = 0, pospre = 0; + double acc = 0.; + while(iter.valid()) { + // positive or negative match? + do { + if(predicate.test(iter)) { + ++poscnt; + } + else { + ++negcnt; + } + iter.advance(); + } // Loop while tied: + while(iter.valid() && iter.tiedToPrevious()); + // Add a new point. + if(poscnt > pospre) { + acc += (poscnt / (double) (poscnt + negcnt)) * (poscnt - pospre); + pospre = poscnt; + } + } + return (poscnt > 0) ? acc / poscnt : 0.; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected AveragePrecisionEvaluation makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/MaximumF1Evaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/MaximumF1Evaluation.java new file mode 100644 index 00000000..bac59b45 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/MaximumF1Evaluation.java @@ -0,0 +1,76 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Evaluate using the maximum F1 score. + * + * @author Erich Schubert + */ +public class MaximumF1Evaluation extends AbstractScoreEvaluation { + /** + * Static instance + */ + public static final MaximumF1Evaluation STATIC = new MaximumF1Evaluation(); + + @Override + public <I extends ScoreIter> double evaluate(Predicate<? super I> predicate, I iter) { + final int postot = predicate.numPositive(); + int poscnt = 0, cnt = 0; + double maxf1 = 0.; + while(iter.valid()) { + // positive or negative match? + do { + if(predicate.test(iter)) { + ++poscnt; + } + ++cnt; + iter.advance(); + } // Loop while tied: + while(iter.valid() && iter.tiedToPrevious()); + // New F1 value: + double p = poscnt / (double) cnt, r = poscnt / (double) postot; + double f1 = 2. * p * r / (p + r); + if(f1 > maxf1) { + maxf1 = f1; + } + } + return maxf1; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected MaximumF1Evaluation makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/PrecisionAtKEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/PrecisionAtKEvaluation.java new file mode 100644 index 00000000..97ab0abd --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/PrecisionAtKEvaluation.java @@ -0,0 +1,127 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * Evaluate using Precision@k, or R-precision (when {@code k=0}). + * + * When {@code k=0}, then it is set to the number of positive objects, and the + * returned value is the R-precision, or the precision-recall break-even-point + * (BEP). + * + * @author Erich Schubert + */ +public class PrecisionAtKEvaluation extends AbstractScoreEvaluation { + /** + * Static instance + */ + public static final PrecisionAtKEvaluation RPRECISION = new PrecisionAtKEvaluation(0); + + /** + * Parameter k. + */ + int k; + + /** + * Constructor. + * + * @param k k to evaluate at. May be 0. + */ + public PrecisionAtKEvaluation(int k) { + this.k = k; + } + + @Override + public <I extends ScoreIter> double evaluate(Predicate<? super I> predicate, I iter) { + final int k = (this.k > 0) ? this.k : predicate.numPositive(); + int total = 0; + double score = 0.; + while(iter.valid() && total < k) { + int posthis = 0, cntthis = 0; + // positive or negative match? + do { + if(predicate.test(iter)) { + ++posthis; + } + ++cntthis; + iter.advance(); + } // Loop while tied: + while(iter.valid() && iter.tiedToPrevious()); + // Special tie computations only when we reach k. + if(total + cntthis > k) { + // p = posthis / cntthis chance of being positive + // n = (k-total) draws. + // expected value = n * p + score += posthis / (double) cntthis * (k - total); + total = k; + break; + } + score += posthis; + total += cntthis; + } + return score / total; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Option ID for the k parameter. + */ + public static final OptionID K_ID = new OptionID("precision.k", // + "k value for precision@k. Can be set to 0, to get R-precision, or the precision-recall-break-even-point."); + + /** + * K parameter + */ + int k; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + IntParameter kP = new IntParameter(K_ID) // + .setDefaultValue(0) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(kP)) { + k = kP.intValue(); + } + } + + @Override + protected PrecisionAtKEvaluation makeInstance() { + return k > 0 ? new PrecisionAtKEvaluation(k) : RPRECISION; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/ROCEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/ROCEvaluation.java new file mode 100644 index 00000000..1a74d25a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/ROCEvaluation.java @@ -0,0 +1,147 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.math.geometry.XYCurve; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Compute ROC (Receiver Operating Characteristics) curves. + * + * A ROC curve compares the true positive rate (y-axis) and false positive rate + * (x-axis). + * + * It was first used in radio signal detection, but has since found widespread + * use in information retrieval, in particular for evaluating binary + * classification problems. + * + * ROC curves are particularly useful to evaluate a ranking of objects with + * respect to a binary classification problem: a random sampling will + * approximately achieve a ROC value of 0.5, while a perfect separation will + * achieve 1.0 (all positives first) or 0.0 (all negatives first). In most use + * cases, a score significantly below 0.5 indicates that the algorithm result + * has been used the wrong way, and should be used backwards. + * + * @author Erich Schubert + * + * @apiviz.has XYCurve + */ +public class ROCEvaluation extends AbstractScoreEvaluation { + /** + * Static instance + */ + public static final ROCEvaluation STATIC = new ROCEvaluation(); + + @Override + public <I extends ScoreIter> double evaluate(Predicate<? super I> predicate, I iter) { + return computeROCAUC(predicate, iter); + } + + /** + * Compute a ROC curve given a set of positive IDs and a sorted list of + * (comparable, ID)s, where the comparable object is used to decided when two + * objects are interchangeable. + * + * @param <I> Iterator type + * @param predicate Predicate to test for positive objects + * @param iter Iterator over results, with ties. + * @return area under curve + */ + public static <I extends ScoreIter> XYCurve materializeROC(Predicate<? super I> predicate, I iter) { + int poscnt = 0, negcnt = 0; + XYCurve curve = new XYCurve("False Positive Rate", "True Positive Rate"); + + // start in bottom left + curve.add(0.0, 0.0); + + while(iter.valid()) { + // positive or negative match? + do { + if(predicate.test(iter)) { + ++poscnt; + } + else { + ++negcnt; + } + iter.advance(); + } // Loop while tied: + while(iter.valid() && iter.tiedToPrevious()); + // Add a new point. + curve.addAndSimplify(negcnt, poscnt); + } + // Ensure we end up in the top right corner. + // Simplification will skip this if we already were. + curve.addAndSimplify(negcnt, poscnt); + curve.rescale(1. / negcnt, 1. / poscnt); + return curve; + } + + /** + * Compute the area under the ROC curve given a set of positive IDs and a + * sorted list of (comparable, ID)s, where the comparable object is used to + * decided when two objects are interchangeable. + * + * @param <I> Iterator type + * @param predicate Predicate to test for positive objects + * @param iter Iterator over results, with ties. + * @return area under curve + */ + public static <I extends ScoreIter> double computeROCAUC(Predicate<? super I> predicate, I iter) { + int poscnt = 0, negcnt = 0, pospre = 0, negpre = 0; + double acc = 0.; + while(iter.valid()) { + // positive or negative match? + do { + if(predicate.test(iter)) { + ++poscnt; + } + else { + ++negcnt; + } + iter.advance(); + } // Loop while tied: + while(iter.valid() && iter.tiedToPrevious()); + if(negcnt > negpre) { + acc += (poscnt + pospre) * .5 * (negcnt - negpre); + negpre = negcnt; + } + pospre = poscnt; + } + acc /= negcnt * (long) poscnt; + return acc == acc ? acc : 0.5; /* Detect NaN */ + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected ROCEvaluation makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/ScoreEvaluation.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/ScoreEvaluation.java new file mode 100644 index 00000000..d32714ad --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/ScoreEvaluation.java @@ -0,0 +1,114 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.Iter; + +/** + * Compute ranking/scoring based evaluation measures. + * + * @author Erich Schubert + */ +public interface ScoreEvaluation { + /** + * Evaluate a given predicate and iterator. + * + * @param predicate Predicate (for positives) + * @param iter Iterator + * @return Score + * @param <I> Iterator type + */ + <I extends ScoreIter> double evaluate(Predicate<? super I> predicate, I iter); + + /** + * Evaluate given a cluster (of positive elements) and a scoring list. + * + * @param clus Cluster object + * @param nei Query result + * @return Score + */ + double evaluate(Cluster<?> clus, DoubleDBIDList nei); + + /** + * Evaluate given a list of positives and a scoring. + * + * @param ids Positive IDs, usually a set. + * @param nei Query Result + * @return Score + */ + double evaluate(DBIDs ids, DoubleDBIDList nei); + + /** + * Evaluate given a set of positives and a scoring. + * + * @param ids Positive IDs, usually a set. + * @param outlier Outlier detection result + * @return Score + */ + double evaluate(DBIDs ids, OutlierResult outlier); + + /** + * Iterator for comparing scores. + * + * @author Erich Schubert + */ + public static interface ScoreIter extends Iter { + /** + * Test whether the score is the same as the previous objects score. + * + * When there is no previous result, implementations should return false! + * + * @return Boolean + */ + boolean tiedToPrevious(); + } + + /** + * Predicate to test whether an object is a true positive or false positive. + * + * @author Erich Schubert + * + * @param <T> Data type + */ + public static interface Predicate<T> { + /** + * Test a result. + * + * @param o Object to test + * @return {@code true} when positive. + */ + boolean test(T o); + + /** + * Return the number of positive ids. + * + * @return Number of positive elements + */ + int numPositive(); + } +} diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DBIDRefIter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DBIDRefIter.java new file mode 100644 index 00000000..2fdb5978 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DBIDRefIter.java @@ -0,0 +1,39 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * A score iterator wrapping a DBIDRef object. + * + * @author Erich Schubert + */ +public interface DBIDRefIter { + /** + * Get the current DBIDRef. + * + * @return DBID reference + */ + DBIDRef getRef(); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DBIDsTest.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DBIDsTest.java new file mode 100644 index 00000000..d41c7af4 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DBIDsTest.java @@ -0,0 +1,59 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.Predicate; + +/** + * Test predicate using a DBID set as positive elements. + * + * @apiviz.composedOf DBIDs + * + * @author Erich Schubert + */ +public class DBIDsTest implements Predicate<DBIDRefIter> { + /** + * DBID set. + */ + private DBIDs set; + + /** + * Constructor. + * + * @param set Set of positive objects + */ + public DBIDsTest(DBIDs set) { + this.set = set; + } + + @Override + public boolean test(DBIDRefIter o) { + return set.contains(o.getRef()); + } + + @Override + public int numPositive() { + return set.size(); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DecreasingVectorIter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DecreasingVectorIter.java new file mode 100644 index 00000000..d0a55b19 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DecreasingVectorIter.java @@ -0,0 +1,116 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.ScoreIter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; +import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.ArrayIter; + +/** + * Class to iterate over a number vector in decreasing order. + * + * @author Erich Schubert + * + * @apiviz.composedOf NumberVector + */ +public class DecreasingVectorIter implements ScoreIter, IntegerComparator, ArrayIter { + /** + * Order of dimensions. + */ + private int[] sort; + + /** + * Data vector. + */ + private NumberVector vec; + + /** + * Current position. + */ + int pos = 0; + + /** + * Constructor. + * + * @param vec Vector to iterate over. + */ + public DecreasingVectorIter(NumberVector vec) { + this.vec = vec; + final int dim = vec.getDimensionality(); + this.sort = new int[dim]; + for(int d = 0; d < dim; d++) { + sort[d] = d; + } + IntegerArrayQuickSort.sort(sort, this); + } + + @Override + public int compare(int x, int y) { + return Double.compare(vec.doubleValue(y), vec.doubleValue(x)); + } + + public int dim() { + return sort[pos]; + } + + @Override + public boolean valid() { + return pos < vec.getDimensionality() && pos >= 0; + } + + @Override + public DecreasingVectorIter advance() { + ++pos; + return this; + } + + @Override + public boolean tiedToPrevious() { + return pos > 0 && Double.compare(vec.doubleValue(sort[pos]), vec.doubleValue(sort[pos - 1])) == 0; + } + + @Override + public int getOffset() { + return pos; + } + + @Override + public DecreasingVectorIter advance(int count) { + pos += count; + return this; + } + + @Override + public DecreasingVectorIter retract() { + pos--; + return this; + } + + @Override + public DecreasingVectorIter seek(int off) { + pos = off; + return this; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DistanceResultAdapter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DistanceResultAdapter.java new file mode 100644 index 00000000..704c38bc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/DistanceResultAdapter.java @@ -0,0 +1,92 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.ScoreIter; + +/** + * This adapter is used to process a list of (double, DBID) objects. The list + * <em>must</em> be sorted appropriately, the score is only used to detect + * ties. + * + * @author Erich Schubert + * + * @apiviz.composedOf DoubleDBIDListIter + */ +public class DistanceResultAdapter implements ScoreIter, DBIDRefIter { + /** + * Original Iterator + */ + protected DoubleDBIDListIter iter; + + /** + * Distance of previous. + */ + protected double prevDist = Double.NaN; + + /** + * Constructor + * + * @param iter Iterator for distance results + */ + public DistanceResultAdapter(DoubleDBIDListIter iter) { + super(); + this.iter = iter; + } + + @Override + public boolean valid() { + return iter.valid(); + } + + @Override + public DistanceResultAdapter advance() { + prevDist = iter.doubleValue(); + iter.advance(); + return this; + } + + @Override + public DBIDRef getRef() { + return iter; + } + + @Override + public boolean tiedToPrevious() { + return iter.doubleValue() == prevDist; + } + + @Deprecated + @Override + public int hashCode() { + return super.hashCode(); + } + + @Deprecated + @Override + public boolean equals(Object obj) { + return super.equals(obj); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/FilteredDistanceResultAdapter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/FilteredDistanceResultAdapter.java new file mode 100644 index 00000000..9a8a2ddc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/FilteredDistanceResultAdapter.java @@ -0,0 +1,76 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; + +/** + * This adapter is used to process a list of (double, DBID) objects, but allows + * skipping one object in the ranking. The list <em>must</em> be sorted + * appropriately, the score is only used to detect ties. + * + * @author Erich Schubert + */ +public class FilteredDistanceResultAdapter extends DistanceResultAdapter { + /** + * DBID to skip (usually: query object). + */ + DBIDRef skip; + + /** + * Constructor + * + * @param iter Iterator for distance results + * @param skip DBID to skip (reference must remain stable!) + */ + public FilteredDistanceResultAdapter(DoubleDBIDListIter iter, DBIDRef skip) { + super(iter); + this.skip = skip; + if(iter.valid() && DBIDUtil.equal(iter, skip)) { + iter.advance(); + } + } + + @Override + public DistanceResultAdapter advance() { + super.advance(); + if(iter.valid() && DBIDUtil.equal(iter, skip)) { + iter.advance(); + } + return this; + } + + @Deprecated + @Override + public int hashCode() { + return super.hashCode(); + } + + @Deprecated + @Override + public boolean equals(Object obj) { + return super.equals(obj); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/IncreasingVectorIter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/IncreasingVectorIter.java new file mode 100644 index 00000000..979dc166 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/IncreasingVectorIter.java @@ -0,0 +1,116 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.ScoreIter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; +import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.ArrayIter; + +/** + * Class to iterate over a number vector in decreasing order. + * + * @author Erich Schubert + * + * @apiviz.composedOf NumberVector + */ +public class IncreasingVectorIter implements ScoreIter, IntegerComparator, ArrayIter { + /** + * Order of dimensions. + */ + private int[] sort; + + /** + * Data vector. + */ + private NumberVector vec; + + /** + * Current position. + */ + int pos = 0; + + /** + * Constructor. + * + * @param vec Vector to iterate over. + */ + public IncreasingVectorIter(NumberVector vec) { + this.vec = vec; + final int dim = vec.getDimensionality(); + this.sort = new int[dim]; + for(int d = 0; d < dim; d++) { + sort[d] = d; + } + IntegerArrayQuickSort.sort(sort, this); + } + + @Override + public int compare(int x, int y) { + return Double.compare(vec.doubleValue(x), vec.doubleValue(y)); + } + + public int dim() { + return sort[pos]; + } + + @Override + public boolean valid() { + return pos < vec.getDimensionality(); + } + + @Override + public IncreasingVectorIter advance() { + ++pos; + return this; + } + + @Override + public boolean tiedToPrevious() { + return pos > 0 && Double.compare(vec.doubleValue(sort[pos]), vec.doubleValue(sort[pos - 1])) == 0; + } + + @Override + public int getOffset() { + return pos; + } + + @Override + public IncreasingVectorIter advance(int count) { + pos += count; + return this; + } + + @Override + public IncreasingVectorIter retract() { + pos--; + return this; + } + + @Override + public IncreasingVectorIter seek(int off) { + pos = off; + return this; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/OutlierScoreAdapter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/OutlierScoreAdapter.java new file mode 100644 index 00000000..a36a57d6 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/OutlierScoreAdapter.java @@ -0,0 +1,103 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.ScoreIter; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; + +/** + * This adapter can be used for an arbitrary collection of Integers, and uses + * that id1.compareTo(id2) != 0 for id1 != id2 to satisfy the comparability. + * + * Note that of course, no id should occur more than once. + * + * The ROC values would be incorrect then anyway! + * + * @author Erich Schubert + * + * @apiviz.composedOf OutlierResult + */ +public class OutlierScoreAdapter implements ScoreIter, DBIDRefIter { + /** + * Original iterator. + */ + private DBIDIter iter; + + /** + * Outlier score. + */ + private DoubleRelation scores; + + /** + * Previous value. + */ + double prev = Double.NaN; + + /** + * Constructor. + * + * @param o Result + */ + public OutlierScoreAdapter(OutlierResult o) { + super(); + this.iter = o.getOrdering().iter(o.getScores().getDBIDs()).iter(); + this.scores = o.getScores(); + } + + @Override + public boolean valid() { + return iter.valid(); + } + + @Override + public OutlierScoreAdapter advance() { + prev = scores.doubleValue(iter); + iter.advance(); + return this; + } + + @Override + public boolean tiedToPrevious() { + return scores.doubleValue(iter) == prev; + } + + @Override + public DBIDRef getRef() { + return iter; + } + + @Deprecated + @Override + public int hashCode() { + return super.hashCode(); + } + + @Deprecated + @Override + public boolean equals(Object obj) { + return super.equals(obj); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/SimpleAdapter.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/SimpleAdapter.java new file mode 100644 index 00000000..beb301a8 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/SimpleAdapter.java @@ -0,0 +1,89 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.ScoreIter; + +/** + * This adapter can be used for an arbitrary collection of Integers, and uses + * that id1.compareTo(id2) != 0 for id1 != id2 to satisfy the comparability. + * + * Note that of course, no id should occur more than once. + * + * The ROC values would be incorrect then anyway! + * + * @author Erich Schubert + * + * @apiviz.composedOf DBIDIter + */ +public class SimpleAdapter implements ScoreIter, DBIDRefIter { + /** + * Original Iterator + */ + private DBIDIter iter; + + /** + * Constructor + * + * @param iter Iterator for object IDs + */ + public SimpleAdapter(DBIDIter iter) { + super(); + this.iter = iter; + } + + @Override + public boolean valid() { + return iter.valid(); + } + + @Override + public SimpleAdapter advance() { + iter.advance(); + return this; + } + + @Override + public boolean tiedToPrevious() { + return false; // No information. + } + + @Deprecated + @Override + public int hashCode() { + return super.hashCode(); + } + + @Deprecated + @Override + public boolean equals(Object obj) { + return super.equals(obj); + } + + @Override + public DBIDRef getRef() { + return iter; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorNonZero.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorNonZero.java new file mode 100644 index 00000000..cc97006f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorNonZero.java @@ -0,0 +1,44 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.NumberVector; + +/** + * Class that uses a NumberVector as reference, and considers all non-zero + * values as positive entries. + * + * @apiviz.composedOf NumberVector + * + * @author Erich Schubert + */ +public class VectorNonZero extends VectorOverThreshold { + /** + * Constructor. + * + * @param vec Reference vector. + */ + public VectorNonZero(NumberVector vec) { + super(vec, 0.); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorOverThreshold.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorOverThreshold.java new file mode 100644 index 00000000..99c24332 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorOverThreshold.java @@ -0,0 +1,79 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.Predicate; + +/** + * Class that uses a NumberVector as reference, and considers all non-zero + * values as positive entries. + * + * @apiviz.composedOf NumberVector + * + * @author Erich Schubert + */ +public class VectorOverThreshold implements Predicate<DecreasingVectorIter> { + /** + * Vector to use as reference + */ + NumberVector vec; + + /** + * Threshold + */ + double threshold; + + /** + * Number of positive values. + */ + int numpos; + + /** + * Constructor. + * + * @param vec Reference vector. + * @param threshold Threshold value. + */ + public VectorOverThreshold(NumberVector vec, double threshold) { + super(); + this.vec = vec; + this.threshold = threshold; + this.numpos = 0; + for(int i = 0, l = vec.getDimensionality(); i < l; i++) { + if(vec.doubleValue(i) > threshold) { + ++numpos; + } + } + } + + @Override + public boolean test(DecreasingVectorIter o) { + return Math.abs(vec.doubleValue(o.dim())) > threshold; + } + + @Override + public int numPositive() { + return numpos; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorZero.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorZero.java new file mode 100644 index 00000000..e7b728fc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/VectorZero.java @@ -0,0 +1,71 @@ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation.Predicate; + +/** + * Class that uses a NumberVector as reference, and considers all zero values as + * positive entries. + * + * @apiviz.composedOf NumberVector + * + * @author Erich Schubert + */ +public class VectorZero implements Predicate<IncreasingVectorIter> { + /** + * Vector to use as reference + */ + NumberVector vec; + + /** + * Number of positive values. + */ + int numpos; + + /** + * Constructor. + * + * @param vec Reference vector. + */ + public VectorZero(NumberVector vec) { + this.vec = vec; + this.numpos = 0; + for(int i = 0, l = vec.getDimensionality(); i < l; i++) { + if(Math.abs(vec.doubleValue(i)) < Double.MIN_NORMAL) { + ++numpos; + } + } + } + + @Override + public boolean test(IncreasingVectorIter o) { + return Math.abs(vec.doubleValue(o.dim())) < Double.MIN_NORMAL; + } + + @Override + public int numPositive() { + return numpos; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/package-info.java new file mode 100644 index 00000000..746d9a1c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/adapter/package-info.java @@ -0,0 +1,26 @@ +/** + * Adapter classes for ranking and scoring measures. + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.evaluation.scores.adapter;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/scores/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/scores/package-info.java new file mode 100644 index 00000000..d409b7f9 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/scores/package-info.java @@ -0,0 +1,28 @@ +/** + * <p>Evaluation of rankings and scorings.</p> + * + * @apiviz.exclude java.util.* + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.evaluation.scores;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java b/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java index 90a11583..e09dba3a 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/ComputeSimilarityMatrixImage.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.evaluation.similaritymatrix; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -42,7 +42,6 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; @@ -90,7 +89,7 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { /** * The distance function to use */ - private DistanceFunction<? super O, ? extends NumberDistance<?, ?>> distanceFunction; + private DistanceFunction<? super O> distanceFunction; /** * Scaling function to use @@ -109,7 +108,7 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { * @param scaling Scaling function to use for contrast * @param skipzero Skip zero values when scaling. */ - public ComputeSimilarityMatrixImage(DistanceFunction<? super O, ? extends NumberDistance<?, ?>> distanceFunction, ScalingFunction scaling, boolean skipzero) { + public ComputeSimilarityMatrixImage(DistanceFunction<? super O> distanceFunction, ScalingFunction scaling, boolean skipzero) { super(); this.distanceFunction = distanceFunction; this.scaling = scaling; @@ -131,7 +130,7 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { if(order.size() != relation.size()) { throw new IllegalStateException("Iterable result doesn't match database size - incomplete ordering?"); } - DistanceQuery<O, ? extends NumberDistance<?, ?>> dq = distanceFunction.instantiate(relation); + DistanceQuery<O> dq = distanceFunction.instantiate(relation); final int size = order.size(); // When the logging is in the outer loop, it's just 2*size (providing enough @@ -148,16 +147,14 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { for(; id1.valid(); id1.advance()) { id2.seek(id1.getOffset()); for(; id2.valid(); id2.advance()) { - final double dist = dq.distance(id1, id2).doubleValue(); + final double dist = dq.distance(id1, id2); if(!Double.isNaN(dist) && !Double.isInfinite(dist) /* && dist > 0.0 */) { if(!skipzero || dist > 0.0) { minmax.put(dist); } } } - if(prog != null) { - prog.incrementProcessed(LOG); - } + LOG.incrementProcessed(prog); } } @@ -173,7 +170,7 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { for(int x = 0; x < size && id1.valid(); x++, id1.advance()) { id2.seek(id1.getOffset()); for(int y = x; y < size && id2.valid(); y++, id2.advance()) { - double ddist = dq.distance(id1, id2).doubleValue(); + double ddist = dq.distance(id1, id2); if(ddist > 0.0) { ddist = scale.getScaled(ddist); } @@ -186,14 +183,10 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { img.setRGB(x, y, col); img.setRGB(y, x, col); } - if(prog != null) { - prog.incrementProcessed(LOG); - } + LOG.incrementProcessed(prog); } } - if(prog != null) { - prog.ensureCompleted(LOG); - } + LOG.ensureCompleted(prog); return new SimilarityMatrix(img, relation, order); } @@ -338,7 +331,7 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { /** * The distance function to use */ - private DistanceFunction<O, ? extends NumberDistance<?, ?>> distanceFunction; + private DistanceFunction<O> distanceFunction; /** * Scaling function to use @@ -353,7 +346,7 @@ public class ComputeSimilarityMatrixImage<O> implements Evaluator { @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<DistanceFunction<O, ? extends NumberDistance<?, ?>>> distanceFunctionP = AbstractAlgorithm.makeParameterDistanceFunction(EuclideanDistanceFunction.class, DistanceFunction.class); + ObjectParameter<DistanceFunction<O>> distanceFunctionP = AbstractAlgorithm.makeParameterDistanceFunction(EuclideanDistanceFunction.class, DistanceFunction.class); if(config.grab(distanceFunctionP)) { distanceFunction = distanceFunctionP.instantiateClass(config); } diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/package-info.java index 4acc0ec6..bcaf0520 100644 --- a/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/evaluation/similaritymatrix/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |