package de.lmu.ifi.dbs.elki.evaluation.histogram; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.regex.Pattern; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.result.HierarchicalResult; import de.lmu.ifi.dbs.elki.result.HistogramResult; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.ResultUtil; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram; import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjStaticHistogram; import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleDoublePair; import de.lmu.ifi.dbs.elki.utilities.scaling.IdentityScaling; import de.lmu.ifi.dbs.elki.utilities.scaling.ScalingFunction; import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction; /** * Compute a Histogram to evaluate a ranking algorithm. * * The parameter {@code -hist.positive} specifies the class label of "positive" * hits. * * @author Lisa Reichert * @author Erich Schubert * * @apiviz.landmark * @apiviz.uses OutlierResult * @apiviz.has ScalingFunction * @apiviz.has HistogramResult oneway - - «create» */ public class ComputeOutlierHistogram implements Evaluator { /** * The object pattern to identify positive classes *

* Key: {@code -comphist.positive} *

*/ public static final OptionID POSITIVE_CLASS_NAME_ID = new OptionID("comphist.positive", "Class label for the 'positive' class."); /** * number of bins for the histogram *

* Default value: {@link EuclideanDistanceFunction} *

* Key: {@code -comphist.bins} *

*/ public static final OptionID BINS_ID = new OptionID("comphist.bins", "number of bins"); /** * Parameter to specify a scaling function to use. *

* Key: {@code -comphist.scaling} *

*/ public static final OptionID SCALING_ID = new OptionID("comphist.scaling", "Class to use as scaling function."); /** * Flag to count frequencies of outliers and non-outliers separately *

* Key: {@code -histogram.splitfreq} *

*/ public static final OptionID SPLITFREQ_ID = new OptionID("histogram.splitfreq", "Use separate frequencies for outliers and non-outliers."); /** * Stores the "positive" class. */ private Pattern positiveClassName = null; /** * Number of bins */ private int bins; /** * Scaling function to use */ private ScalingFunction scaling; /** * Flag to make split frequencies */ private boolean splitfreq = false; /** * Constructor. * * @param positive_class_name Class name * @param bins Bins * @param scaling Scaling * @param splitfreq Scale inlier and outlier frequencies independently */ public ComputeOutlierHistogram(Pattern positive_class_name, int bins, ScalingFunction scaling, boolean splitfreq) { super(); this.positiveClassName = positive_class_name; this.bins = bins; this.scaling = scaling; this.splitfreq = splitfreq; } /** * Evaluate a single outlier result as histogram. * * @param database Database to process * @param or Outlier result * @return Result */ public HistogramResult evaluateOutlierResult(Database database, OutlierResult or) { if(scaling instanceof OutlierScalingFunction) { OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling; oscaling.prepare(or); } ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs()); DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName); // first value for outliers, second for each object // If we have useful (finite) min/max, use these for binning. double min = scaling.getMin(); double max = scaling.getMax(); final ObjHistogram hist; if(Double.isInfinite(min) || Double.isNaN(min) || Double.isInfinite(max) || Double.isNaN(max)) { hist = new AbstractObjDynamicHistogram(bins) { @Override public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) { first.first += second.first; first.second += second.second; return first; } @Override protected DoubleDoublePair makeObject() { return new DoubleDoublePair(0., 0.); } @Override protected DoubleDoublePair cloneForCache(DoubleDoublePair data) { return new DoubleDoublePair(data.first, data.second); } @Override protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) { DoubleDoublePair sum = new DoubleDoublePair(0, 0); for(int i = start; i < end; i++) { DoubleDoublePair p = (DoubleDoublePair) data[i]; if(p != null) { sum.first += p.first; sum.second += p.second; } } return sum; } }; } else { hist = new AbstractObjStaticHistogram(bins, min, max) { @Override protected DoubleDoublePair makeObject() { return new DoubleDoublePair(0., 0.); } @Override public void putData(double coord, DoubleDoublePair data) { DoubleDoublePair exist = get(coord); exist.first += data.first; exist.second += data.second; } }; } // first fill histogram only with values of outliers DoubleDoublePair negative, positive; if(!splitfreq) { negative = new DoubleDoublePair(1. / ids.size(), 0); positive = new DoubleDoublePair(0, 1. / ids.size()); } else { negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0); positive = new DoubleDoublePair(0, 1. / outlierIds.size()); } ids.removeDBIDs(outlierIds); // fill histogram with values of each object for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { double result = or.getScores().get(iter); result = scaling.getScaled(result); hist.putData(result, negative); } for(DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) { double result = or.getScores().get(iter); result = scaling.getScaled(result); hist.putData(result, positive); } Collection collHist = new ArrayList<>(hist.getNumBins()); for(ObjHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) { DoubleDoublePair data = iter.getValue(); DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), data.first, data.second }); collHist.add(row); } return new HistogramResult<>("Outlier Score Histogram", "outlier-histogram", collHist); } @Override public void processNewResult(HierarchicalResult baseResult, Result result) { final Database db = ResultUtil.findDatabase(baseResult); List ors = ResultUtil.filterResults(result, OutlierResult.class); if(ors == null || ors.size() <= 0) { // logger.warning("No outlier results found for "+ComputeOutlierHistogram.class.getSimpleName()); return; } for(OutlierResult or : ors) { db.getHierarchy().add(or, evaluateOutlierResult(db, or)); } } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { /** * Stores the "positive" class. */ protected Pattern positiveClassName = null; /** * Number of bins */ protected int bins; /** * Scaling function to use */ protected ScalingFunction scaling; /** * Flag to make split frequencies */ protected boolean splitfreq = false; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID); positiveClassNameP.setOptional(true); if(config.grab(positiveClassNameP)) { positiveClassName = positiveClassNameP.getValue(); } IntParameter binsP = new IntParameter(BINS_ID, 50); binsP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if(config.grab(binsP)) { bins = binsP.getValue(); } ObjectParameter scalingP = new ObjectParameter<>(SCALING_ID, ScalingFunction.class, IdentityScaling.class); if(config.grab(scalingP)) { scaling = scalingP.instantiateClass(config); } Flag splitfreqF = new Flag(SPLITFREQ_ID); if(config.grab(splitfreqF)) { splitfreq = splitfreqF.getValue(); } } @Override protected ComputeOutlierHistogram makeInstance() { return new ComputeOutlierHistogram(positiveClassName, bins, scaling, splitfreq); } } }