package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.Arrays;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.DoubleIntegerArrayQuickSort;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
* Fast Outlier Detection Using the "Local Correlation Integral".
*
* Exact implementation only, not aLOCI. See {@link ALOCI}.
*
* Outlier detection using multiple epsilon neighborhoods.
*
* This implementation has O(n3 log n) runtime complexity!
*
* Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos:
* LOCI: Fast Outlier Detection Using the Local Correlation Integral. In: Proc.
* 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003.
*
* @author Erich Schubert
*
* @apiviz.has RangeQuery
*
* @param Object type
*/
@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral")
@Description("Algorithm to compute outliers based on the Local Correlation Integral")
@Reference(authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802")
@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI" })
public class LOCI extends AbstractDistanceBasedAlgorithm implements OutlierAlgorithm {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(LOCI.class);
/**
* Parameter to specify the maximum radius of the neighborhood to be
* considered, must be suitable to the distance function specified.
*/
public static final OptionID RMAX_ID = new OptionID("loci.rmax", "The maximum radius of the neighborhood to be considered.");
/**
* Parameter to specify the minimum neighborhood size
*/
public static final OptionID NMIN_ID = new OptionID("loci.nmin", "Minimum neighborhood size to be considered.");
/**
* Parameter to specify the averaging neighborhood scaling.
*/
public static final OptionID ALPHA_ID = new OptionID("loci.alpha", "Scaling factor for averaging neighborhood");
/**
* Holds the value of {@link #RMAX_ID}.
*/
private double rmax;
/**
* Holds the value of {@link #NMIN_ID}.
*/
private int nmin;
/**
* Holds the value of {@link #ALPHA_ID}.
*/
private double alpha;
/**
* Constructor.
*
* @param distanceFunction Distance function
* @param rmax Maximum radius
* @param nmin Minimum neighborhood size
* @param alpha Alpha value
*/
public LOCI(DistanceFunction super O> distanceFunction, double rmax, int nmin, double alpha) {
super(distanceFunction);
this.rmax = rmax;
this.nmin = nmin;
this.alpha = alpha;
}
/**
* Run the algorithm
*
* @param database Database to process
* @param relation Relation to process
* @return Outlier result
*/
public OutlierResult run(Database database, Relation relation) {
DistanceQuery distFunc = database.getDistanceQuery(relation, getDistanceFunction());
RangeQuery rangeQuery = database.getRangeQuery(distFunc);
DBIDs ids = relation.getDBIDs();
// LOCI preprocessing step
WritableDataStore interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class);
precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
// LOCI main step
FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
DoubleMinMax minmax = new DoubleMinMax();
// Shared instance, to save allocations.
MeanVariance mv_n_r_alpha = new MeanVariance();
for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
final DoubleIntArrayList cdist = interestingDistances.get(iditer);
final double maxdist = cdist.getDouble(cdist.size() - 1);
final int maxneig = cdist.getInt(cdist.size() - 1);
double maxmdefnorm = 0.0;
double maxnormr = 0;
if(maxneig >= nmin) {
// Compute the largest neighborhood we will need.
DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
// TODO: Ensure the result is sorted. This is currently implied.
// For any critical distance, compute the normalized MDEF score.
for(int i = 0, size = cdist.size(); i < size; i++) {
// Only start when minimum size is fulfilled
if(cdist.getInt(i) < nmin) {
continue;
}
final double r = cdist.getDouble(i);
final double alpha_r = alpha * r;
// compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
final int n_alphar = cdist.getInt(cdist.find(alpha_r));
// compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF}
mv_n_r_alpha.reset();
for(DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) {
// Stop at radius r
if(neighbor.doubleValue() > r) {
break;
}
DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
mv_n_r_alpha.put(rn_alphar);
}
// We only use the average and standard deviation
final double nhat_r_alpha = mv_n_r_alpha.getMean();
final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();
// Redundant divisions by nhat_r_alpha removed.
final double mdef = nhat_r_alpha - n_alphar;
final double sigmamdef = sigma_nhat_r_alpha;
final double mdefnorm = mdef / sigmamdef;
if(mdefnorm > maxmdefnorm) {
maxmdefnorm = mdefnorm;
maxnormr = r;
}
}
}
else {
// FIXME: when nmin was not fulfilled - what is the proper value then?
maxmdefnorm = Double.POSITIVE_INFINITY;
maxnormr = maxdist;
}
mdef_norm.putDouble(iditer, maxmdefnorm);
mdef_radius.putDouble(iditer, maxnormr);
minmax.put(maxmdefnorm);
LOG.incrementProcessed(progressLOCI);
}
LOG.ensureCompleted(progressLOCI);
DoubleRelation scoreResult = new MaterializedDoubleRelation("LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
result.addChildResult(new MaterializedDoubleRelation("LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
return result;
}
/**
* Preprocessing step: determine the radii of interest for each point.
*
* @param ids IDs to process
* @param rangeQuery Range query
* @param interestingDistances Distances of interest
*/
protected void precomputeInterestingRadii(DBIDs ids, RangeQuery rangeQuery, WritableDataStore interestingDistances) {
FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
// build list of critical distances
DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
{
int i = 0;
DoubleDBIDListIter ni = neighbors.iter();
while(ni.valid()) {
final double curdist = ni.doubleValue();
++i;
ni.advance();
// Skip, if tied to the next object:
if(ni.valid() && curdist == ni.doubleValue()) {
continue;
}
cdist.append(curdist, i);
// Scale radius, and reinsert
if(alpha != 1.) {
final double ri = curdist / alpha;
if(ri <= rmax) {
cdist.append(ri, Integer.MIN_VALUE);
}
}
}
}
cdist.sort();
// fill the gaps to have fast lookups of number of neighbors at a given
// distance.
int lastk = 0;
for(int i = 0, size = cdist.size(); i < size; i++) {
final int k = cdist.getInt(i);
if(k == Integer.MIN_VALUE) {
cdist.setValue(i, lastk);
}
else {
lastk = k;
}
}
// TODO: shrink the list, removing duplicate radii?
interestingDistances.put(iditer, cdist);
LOG.incrementProcessed(progressPreproc);
}
LOG.ensureCompleted(progressPreproc);
}
/**
* Array of double-int values.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected static class DoubleIntArrayList {
/**
* Double keys
*/
double[] keys;
/**
* Integer values
*/
int[] vals;
/**
* Used size
*/
int size = 0;
/**
* Constructor.
*
* @param alloc Initial allocation.
*/
public DoubleIntArrayList(int alloc) {
keys = new double[alloc];
vals = new int[alloc];
size = 0;
}
/**
* Collection size.
*
* @return Size
*/
public int size() {
return size;
}
/**
* Get the key at the given position.
*
* @param i Position
* @return Key
*/
public double getDouble(int i) {
return keys[i];
}
/**
* Get the value at the given position.
*
* @param i Position
* @return Value
*/
public int getInt(int i) {
return vals[i];
}
/**
* Get the value at the given position.
*
* @param i Position
* @param val New value
*/
public void setValue(int i, int val) {
vals[i] = val;
}
/**
* Append a key-value pair.
*
* @param key Key to append
* @param val Value to append.
*/
public void append(double key, int val) {
if(size == keys.length) {
keys = Arrays.copyOf(keys, size << 1);
vals = Arrays.copyOf(vals, size << 1);
}
keys[size] = key;
vals[size] = val;
++size;
}
/**
* Find the last position with a smaller or equal key.
*
* @param search Key
* @return Position
*/
public int find(final double search) {
int a = 0, b = size - 1;
while(a <= b) {
final int mid = (a + b) >>> 1;
final double cur = keys[mid];
if(cur > search) {
b = mid - 1;
}
else { // less or equal!
a = mid + 1;
}
}
return b;
}
/**
* Sort the array list.
*/
public void sort() {
DoubleIntegerArrayQuickSort.sort(keys, vals, size);
}
}
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
}
@Override
protected Logging getLogger() {
return LOG;
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*
* @param Object type
*/
public static class Parameterizer extends AbstractDistanceBasedAlgorithm.Parameterizer {
protected double rmax;
protected int nmin = 0;
protected double alpha = 0.5;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final DoubleParameter rmaxP = new DoubleParameter(RMAX_ID);
if(config.grab(rmaxP)) {
rmax = rmaxP.doubleValue();
}
final IntParameter nminP = new IntParameter(NMIN_ID, 20);
if(config.grab(nminP)) {
nmin = nminP.intValue();
}
final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5);
if(config.grab(alphaP)) {
alpha = alphaP.getValue();
}
}
@Override
protected LOCI makeInstance() {
return new LOCI<>(distanceFunction, rmax, nmin, alpha);
}
}
}