package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
import de.lmu.ifi.dbs.elki.database.ids.KNNList;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
* Connectivity-based outlier factor (COF).
*
* Reference:
*
* J. Tang, Z. Chen, A. W. C. Fu, D. W. Cheung
* Enhancing effectiveness of outlier detections for low density patterns.
* In Advances in Knowledge Discovery and Data Mining.
*
*
* @author Erich Schubert
*
* @param Object type
*/
@Reference(authors = "J. Tang, Z. Chen, A. W. C. Fu, D. W. Cheung", //
title = "Enhancing effectiveness of outlier detections for low density patterns", //
booktitle = "In Advances in Knowledge Discovery and Data Mining", //
url = "http://dx.doi.org/10.1007/3-540-47887-6_53")
public class COF extends AbstractDistanceBasedAlgorithm implements OutlierAlgorithm {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(COF.class);
/**
* The number of neighbors to query (including the query point!)
*/
protected int k;
/**
* Constructor.
*
* @param k the number of neighbors to use for comparison (excluding the query
* point)
* @param distanceFunction the neighborhood distance function
*/
public COF(int k, DistanceFunction super O> distanceFunction) {
super(distanceFunction);
this.k = k + 1; // + query point
}
/**
* Runs the COF algorithm on the given database.
*
* @param database Database to query
* @param relation Data to process
* @return COF outlier result
*/
public OutlierResult run(Database database, Relation relation) {
StepProgress stepprog = LOG.isVerbose() ? new StepProgress("COF", 3) : null;
DistanceQuery dq = database.getDistanceQuery(relation, getDistanceFunction());
LOG.beginStep(stepprog, 1, "Materializing COF neighborhoods.");
KNNQuery knnq = DatabaseUtil.precomputedKNNQuery(database, relation, dq, k);
DBIDs ids = relation.getDBIDs();
LOG.beginStep(stepprog, 2, "Computing Average Chaining Distances.");
WritableDoubleDataStore acds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
computeAverageChainingDistances(knnq, dq, ids, acds);
// compute COF_SCORE of each db object
LOG.beginStep(stepprog, 3, "Computing Connectivity-based Outlier Factors.");
WritableDoubleDataStore cofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_DB);
// track the maximum value for normalization.
DoubleMinMax cofminmax = new DoubleMinMax();
computeCOFScores(knnq, ids, acds, cofs, cofminmax);
LOG.setCompleted(stepprog);
// Build result representation.
DoubleRelation scoreResult = new MaterializedDoubleRelation("Connectivity-Based Outlier Factor", "cof-outlier", cofs, ids);
OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(cofminmax.getMin(), cofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
return new OutlierResult(scoreMeta, scoreResult);
}
/**
* Computes the average chaining distance, the average length of a path
* through the given set of points to each target. The authors of COF decided
* to approximate this value using a weighted mean that assumes every object
* is reached from the previous point (but actually every point could be best
* reachable from the first, in which case this does not make much sense.)
*
* TODO: can we accelerate this by using the kNN of the neighbors?
*
* @param knnq KNN query
* @param dq Distance query
* @param ids IDs to process
* @param acds Storage for average chaining distances
*/
protected void computeAverageChainingDistances(KNNQuery knnq, DistanceQuery dq, DBIDs ids, WritableDoubleDataStore acds) {
FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("Computing average chaining distances", ids.size(), LOG) : null;
// Compute the chaining distances.
// We do not bother to materialize the chaining order.
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
final KNNList neighbors = knnq.getKNNForDBID(iter, k);
final int r = neighbors.size();
DoubleDBIDListIter it1 = neighbors.iter(), it2 = neighbors.iter();
// Store the current lowest reachability.
final double[] mindists = new double[r];
for(int i = 0; it1.valid(); it1.advance(), ++i) {
mindists[i] = DBIDUtil.equal(it1, iter) ? Double.NaN : it1.doubleValue();
}
double acsum = 0.;
for(int j = ((r < k) ? r : k) - 1; j > 0; --j) {
// Find the minimum:
int minpos = -1;
double mindist = Double.NaN;
for(int i = 0; i < mindists.length; ++i) {
double curdist = mindists[i];
// Both values could be NaN, deliberately.
if(curdist == curdist && !(curdist > mindist)) {
minpos = i;
mindist = curdist;
}
}
acsum += mindist * j; // Weighted sum, decreasing weights
mindists[minpos] = Double.NaN;
it1.seek(minpos);
// Update distances
it2.seek(0);
for(int i = 0; it2.valid(); it2.advance(), ++i) {
final double curdist = mindists[i];
if(curdist != curdist) {
continue; // NaN = processed!
}
double newdist = dq.distance(it1, it2);
if(newdist < curdist) {
mindists[i] = newdist;
}
}
}
acds.putDouble(iter, acsum / (r * 0.5 * (r - 1.)));
LOG.incrementProcessed(lrdsProgress);
}
LOG.ensureCompleted(lrdsProgress);
}
/**
* Compute Connectivity outlier factors.
*
* @param knnq KNN query
* @param ids IDs to process
* @param acds Average chaining distances
* @param cofs Connectivity outlier factor storage
* @param cofminmax Score minimum/maximum tracker
*/
private void computeCOFScores(KNNQuery knnq, DBIDs ids, DoubleDataStore acds, WritableDoubleDataStore cofs, DoubleMinMax cofminmax) {
FiniteProgress progressCOFs = LOG.isVerbose() ? new FiniteProgress("COF for objects", ids.size(), LOG) : null;
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
final KNNList neighbors = knnq.getKNNForDBID(iter, k);
// Aggregate the average chaining distances of all neighbors:
double sum = 0.;
for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
// skip the point itself
if(DBIDUtil.equal(neighbor, iter)) {
continue;
}
sum += acds.doubleValue(neighbor);
}
final double cof = (sum > 0.) ? (acds.doubleValue(iter) * k / sum) : (acds.doubleValue(iter) > 0. ? Double.POSITIVE_INFINITY : 1.);
cofs.putDouble(iter, cof);
// update minimum and maximum
cofminmax.put(cof);
LOG.incrementProcessed(progressCOFs);
}
LOG.ensureCompleted(progressCOFs);
}
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
}
@Override
protected Logging getLogger() {
return LOG;
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*
* @param Object type
*/
public static class Parameterizer extends AbstractDistanceBasedAlgorithm.Parameterizer {
/**
* Parameter to specify the neighborhood size for COF. This does not include
* the query object.
*/
public static final OptionID K_ID = new OptionID("cof.k", "The number of neighbors (not including the query object) to use for computing the COF score.");
/**
* The neighborhood size to use.
*/
protected int k;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter pK = new IntParameter(K_ID) //
.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(pK)) {
k = pK.intValue();
}
}
@Override
protected COF makeInstance() {
return new COF<>(k, distanceFunction);
}
}
}