package de.lmu.ifi.dbs.elki.algorithm.outlier;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
* Angle-Based Outlier Detection / Angle-Based Outlier Factor.
*
* Fast-ABOD (approximateABOF) version.
*
* H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in
* High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge
* Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008.
*
* @author Matthias Schubert (Original Code)
* @author Erich Schubert (ELKIfication)
*
* @param Vector type
*/
@Title("Approximate ABOD: Angle-Based Outlier Detection")
@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.")
@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946")
public class FastABOD> extends ABOD {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(FastABOD.class);
/**
* Number of nearest neighbors.
*/
protected int k;
/**
* Constructor for Angle-Based Outlier Detection (ABOD).
*
* @param kernelFunction kernel function to use
* @param k Number of nearest neighbors
*/
public FastABOD(SimilarityFunction super V, DoubleDistance> kernelFunction, int k) {
super(kernelFunction);
this.k = k;
}
/**
* Run Fast-ABOD on the data set.
*
* @param relation Relation to process
* @return Outlier detection result
*/
@Override
public OutlierResult run(Database db, Relation relation) {
DBIDs ids = relation.getDBIDs();
// Build a kernel matrix, to make O(n^3) slightly less bad.
SimilarityQuery sq = db.getSimilarityQuery(relation, kernelFunction);
KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
DoubleMinMax minmaxabod = new DoubleMinMax();
MeanVariance s = new MeanVariance();
for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) {
s.reset();
final double simAA = kernelMatrix.getSimilarity(pA, pA);
// Choose the k-min nearest
ComparableMaxHeap nn = new ComparableMaxHeap<>(k);
for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) {
if (DBIDUtil.equal(nB, pA)) {
continue;
}
double simBB = kernelMatrix.getSimilarity(nB, nB);
double simAB = kernelMatrix.getSimilarity(pA, nB);
double sqdAB = simAA + simBB - simAB - simAB;
if (!(sqdAB > 0.)) {
continue;
}
if (nn.size() < k) {
nn.add(DBIDUtil.newPair(sqdAB, nB));
} else if (sqdAB < nn.peek().doubleValue()) {
nn.replaceTopElement(DBIDUtil.newPair(sqdAB, nB));
}
}
for (ObjectHeap.UnsortedIter iB = nn.unsortedIter(); iB.valid(); iB.advance()) {
DoubleDBIDPair nB = iB.get();
double sqdAB = nB.doubleValue();
double simAB = kernelMatrix.getSimilarity(pA, nB);
if (!(sqdAB > 0.)) {
continue;
}
for (ObjectHeap.UnsortedIter iC = nn.unsortedIter(); iC.valid(); iC.advance()) {
DoubleDBIDPair nC = iC.get();
if (DBIDUtil.compare(nC, nB) < 0) {
continue;
}
double sqdAC = nC.doubleValue();
double simAC = kernelMatrix.getSimilarity(pA, nC);
if (!(sqdAC > 0.)) {
continue;
}
// Exploit bilinearity of scalar product:
// = -
// = - - +
// For computing variance, AA is a constant and can be ignored.
double simBC = kernelMatrix.getSimilarity(nB, nC);
double numerator = simBC - simAB - simAC; // + simAA;
double val = numerator / (sqdAB * sqdAC);
s.put(val, 1. / Math.sqrt(sqdAB * sqdAC));
}
}
// Sample variance probably would be correct, but the ABOD publication
// uses the naive variance.
final double abof = s.getNaiveVariance();
minmaxabod.put(abof);
abodvalues.putDouble(pA, abof);
}
// Build result representation.
Relation scoreResult = new MaterializedRelation<>("Angle-Based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
return new OutlierResult(scoreMeta, scoreResult);
}
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
}
@Override
protected Logging getLogger() {
return LOG;
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static class Parameterizer> extends ABOD.Parameterizer {
/**
* Parameter for the nearest neighbors.
*/
public static final OptionID K_ID = new OptionID("fastabod.k", "Number of nearest neighbors to use for ABOD.");
/**
* Number of neighbors.
*/
protected int k;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kP = new IntParameter(K_ID);
if (config.grab(kP)) {
k = kP.intValue();
}
}
@Override
protected FastABOD makeInstance() {
return new FastABOD<>(kernelFunction, k);
}
}
}