package de.lmu.ifi.dbs.elki.algorithm.outlier; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.MeanVariance; import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * LoOP: Local Outlier Probabilities * * Distance/density based algorithm similar to LOF to detect outliers, but with * statistical methods to achieve better result stability. * * @author Erich Schubert * * @apiviz.has KNNQuery * * @param the type of DatabaseObjects handled by this Algorithm */ @Title("LoOP: Local Outlier Probabilities") @Description("Variant of the LOF algorithm normalized using statistical values.") @Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "LoOP: Local Outlier Probabilities", booktitle = "Proceedings of the 18th International Conference on Information and Knowledge Management (CIKM), Hong Kong, China, 2009", url = "http://dx.doi.org/10.1145/1645953.1646195") public class LoOP> extends AbstractAlgorithm implements OutlierAlgorithm { /** * The logger for this class. */ private static final Logging logger = Logging.getLogger(LoOP.class); /** * The distance function to determine the reachability distance between * database objects. */ public static final OptionID REACHABILITY_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("loop.referencedistfunction", "Distance function to determine the density of an object."); /** * The distance function to determine the reachability distance between * database objects. */ public static final OptionID COMPARISON_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("loop.comparedistfunction", "Distance function to determine the reference set of an object."); /** * Parameter to specify the number of nearest neighbors of an object to be * considered for computing its LOOP_SCORE, must be an integer greater than 1. */ public static final OptionID KREACH_ID = OptionID.getOrCreateOptionID("loop.kref", "The number of nearest neighbors of an object to be used for the PRD value."); /** * Parameter to specify the number of nearest neighbors of an object to be * considered for computing its LOOP_SCORE, must be an integer greater than 1. */ public static final OptionID KCOMP_ID = OptionID.getOrCreateOptionID("loop.kcomp", "The number of nearest neighbors of an object to be considered for computing its LOOP_SCORE."); /** * Parameter to specify the number of nearest neighbors of an object to be * considered for computing its LOOP_SCORE, must be an integer greater than 1. */ public static final OptionID LAMBDA_ID = OptionID.getOrCreateOptionID("loop.lambda", "The number of standard deviations to consider for density computation."); /** * Holds the value of {@link #KREACH_ID}. */ int kreach; /** * Holds the value of {@link #KCOMP_ID}. */ int kcomp; /** * Hold the value of {@link #LAMBDA_ID}. */ double lambda; /** * Preprocessor Step 1 */ protected DistanceFunction reachabilityDistanceFunction; /** * Preprocessor Step 2 */ protected DistanceFunction comparisonDistanceFunction; /** * Include object itself in kNN neighborhood. */ static boolean objectIsInKNN = false; /** * Constructor with parameters. * * @param kreach * @param kcomp * @param reachabilityDistanceFunction * @param comparisonDistanceFunction * @param lambda */ public LoOP(int kreach, int kcomp, DistanceFunction reachabilityDistanceFunction, DistanceFunction comparisonDistanceFunction, double lambda) { super(); this.kreach = kreach; this.kcomp = kcomp; this.reachabilityDistanceFunction = reachabilityDistanceFunction; this.comparisonDistanceFunction = comparisonDistanceFunction; this.lambda = lambda; } /** * Get the kNN queries for the algorithm. * * @param database Database * @param stepprog Progress logger * @return result */ protected Pair, KNNQuery> getKNNQueries(Database database, Relation relation, StepProgress stepprog) { KNNQuery knnComp; KNNQuery knnReach; if(comparisonDistanceFunction == reachabilityDistanceFunction || comparisonDistanceFunction.equals(reachabilityDistanceFunction)) { // We need each neighborhood twice - use "HEAVY" flag. knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, Math.max(kreach, kcomp), DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); // No optimized kNN query - use a preprocessor! if(knnComp == null) { if(stepprog != null) { stepprog.beginStep(1, "Materializing neighborhoods with respect to reference neighborhood distance function.", logger); } MaterializeKNNPreprocessor preproc = new MaterializeKNNPreprocessor(relation, comparisonDistanceFunction, kcomp); database.addIndex(preproc); DistanceQuery cdq = database.getDistanceQuery(relation, comparisonDistanceFunction); knnComp = preproc.getKNNQuery(cdq, kreach, DatabaseQuery.HINT_HEAVY_USE); } else { if(stepprog != null) { stepprog.beginStep(1, "Optimized neighborhoods provided by database.", logger); } } knnReach = knnComp; } else { if(stepprog != null) { stepprog.beginStep(1, "Not materializing distance functions, since we request each DBID once only.", logger); } knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, kreach); knnReach = QueryUtil.getKNNQuery(relation, reachabilityDistanceFunction, kcomp); } return new Pair, KNNQuery>(knnComp, knnReach); } /** * Performs the LoOP algorithm on the given database. */ public OutlierResult run(Database database, Relation relation) throws IllegalStateException { final double sqrt2 = Math.sqrt(2.0); StepProgress stepprog = logger.isVerbose() ? new StepProgress(5) : null; Pair, KNNQuery> pair = getKNNQueries(database, relation, stepprog); KNNQuery knnComp = pair.getFirst(); KNNQuery knnReach = pair.getSecond(); // Assert we got something if(knnComp == null) { throw new AbortException("No kNN queries supported by database for comparison distance function."); } if(knnReach == null) { throw new AbortException("No kNN queries supported by database for density estimation distance function."); } // Probabilistic distances WritableDoubleDataStore pdists = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); {// computing PRDs if(stepprog != null) { stepprog.beginStep(3, "Computing pdists", logger); } FiniteProgress prdsProgress = logger.isVerbose() ? new FiniteProgress("pdists", relation.size(), logger) : null; for(DBID id : relation.iterDBIDs()) { final KNNResult neighbors = knnReach.getKNNForDBID(id, kreach); double sqsum = 0.0; // use first kref neighbors as reference set int ks = 0; for(DistanceResultPair neighbor : neighbors) { if(objectIsInKNN || !neighbor.getDBID().equals(id)) { double d = neighbor.getDistance().doubleValue(); sqsum += d * d; ks++; if(ks >= kreach) { break; } } } double pdist = lambda * Math.sqrt(sqsum / ks); pdists.putDouble(id, pdist); if(prdsProgress != null) { prdsProgress.incrementProcessed(logger); } } } // Compute PLOF values. WritableDoubleDataStore plofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); MeanVariance mvplof = new MeanVariance(); {// compute LOOP_SCORE of each db object if(stepprog != null) { stepprog.beginStep(4, "Computing PLOF", logger); } FiniteProgress progressPLOFs = logger.isVerbose() ? new FiniteProgress("PLOFs for objects", relation.size(), logger) : null; for(DBID id : relation.iterDBIDs()) { final KNNResult neighbors = knnComp.getKNNForDBID(id, kcomp); MeanVariance mv = new MeanVariance(); // use first kref neighbors as comparison set. int ks = 0; for(DistanceResultPair neighbor1 : neighbors) { if(objectIsInKNN || !neighbor1.getDBID().equals(id)) { mv.put(pdists.doubleValue(neighbor1.getDBID())); ks++; if(ks >= kcomp) { break; } } } double plof = Math.max(pdists.doubleValue(id) / mv.getMean(), 1.0); if(Double.isNaN(plof) || Double.isInfinite(plof)) { plof = 1.0; } plofs.putDouble(id, plof); mvplof.put((plof - 1.0) * (plof - 1.0)); if(progressPLOFs != null) { progressPLOFs.incrementProcessed(logger); } } } double nplof = lambda * Math.sqrt(mvplof.getMean()); if(logger.isDebugging()) { logger.verbose("nplof normalization factor is " + nplof + " " + mvplof.getMean() + " " + mvplof.getSampleStddev()); } // Compute final LoOP values. WritableDoubleDataStore loops = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); {// compute LOOP_SCORE of each db object if(stepprog != null) { stepprog.beginStep(5, "Computing LoOP scores", logger); } FiniteProgress progressLOOPs = logger.isVerbose() ? new FiniteProgress("LoOP for objects", relation.size(), logger) : null; for(DBID id : relation.iterDBIDs()) { loops.putDouble(id, NormalDistribution.erf((plofs.doubleValue(id) - 1) / (nplof * sqrt2))); if(progressLOOPs != null) { progressLOOPs.incrementProcessed(logger); } } } if(stepprog != null) { stepprog.setCompleted(logger); } // Build result representation. Relation scoreResult = new MaterializedRelation("Local Outlier Probabilities", "loop-outlier", TypeUtil.DOUBLE, loops, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore(); return new OutlierResult(scoreMeta, scoreResult); } @Override public TypeInformation[] getInputTypeRestriction() { final TypeInformation type; if(reachabilityDistanceFunction.equals(comparisonDistanceFunction)) { type = reachabilityDistanceFunction.getInputTypeRestriction(); } else { type = new CombinedTypeInformation(reachabilityDistanceFunction.getInputTypeRestriction(), comparisonDistanceFunction.getInputTypeRestriction()); } return TypeUtil.array(type); } @Override protected Logging getLogger() { return logger; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer> extends AbstractParameterizer { /** * Holds the value of {@link #KREACH_ID}. */ int kreach = 0; /** * Holds the value of {@link #KCOMP_ID}. */ int kcomp = 0; /** * Hold the value of {@link #LAMBDA_ID}. */ double lambda = 2.0; /** * Preprocessor Step 1 */ protected DistanceFunction reachabilityDistanceFunction = null; /** * Preprocessor Step 2 */ protected DistanceFunction comparisonDistanceFunction = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter kcompP = new IntParameter(KCOMP_ID, new GreaterConstraint(1)); if(config.grab(kcompP)) { kcomp = kcompP.getValue(); } final ObjectParameter> compDistP = new ObjectParameter>(COMPARISON_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); if(config.grab(compDistP)) { comparisonDistanceFunction = compDistP.instantiateClass(config); } final IntParameter kreachP = new IntParameter(KREACH_ID, new GreaterConstraint(1), true); if(config.grab(kreachP)) { kreach = kreachP.getValue(); } else { kreach = kcomp; } final ObjectParameter> reachDistP = new ObjectParameter>(REACHABILITY_DISTANCE_FUNCTION_ID, DistanceFunction.class, true); if(config.grab(reachDistP)) { reachabilityDistanceFunction = reachDistP.instantiateClass(config); } // TODO: make default 1.0? final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, new GreaterConstraint(0.0), 2.0); if(config.grab(lambdaP)) { lambda = lambdaP.getValue(); } } @Override protected LoOP makeInstance() { DistanceFunction realreach = (reachabilityDistanceFunction != null) ? reachabilityDistanceFunction : comparisonDistanceFunction; return new LoOP(kreach, kcomp, realreach, comparisonDistanceFunction, lambda); } } }