package de.lmu.ifi.dbs.elki.algorithm.outlier;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.generic.MaskedDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
/**
 * Outlier detection algorithm using a mixture model approach. The data is
 * modeled as a mixture of two distributions, a Gaussian distribution for
 * ordinary data and a uniform distribution for outliers. At first all objects
 * are in the set of normal objects and the set of anomalous objects is empty.
 * An iterative procedure then transfers objects from the ordinary set to the
 * anomalous set if the transfer increases the overall likelihood of the data.
 *
 * Reference:
 *
 * E. Eskin
 * Anomaly detection over noisy data using learned probability distributions.
 * In Proc. of the Seventeenth International Conference on Machine Learning
 * (ICML-2000).
 *
 * @author Lisa Reichert
 * @since 0.3
 *
 * @param <V> Vector type
 */
@Title("Gaussian-Uniform Mixture Model Outlier Detection")
@Description("Fits a mixture model consisting of a Gaussian and a uniform distribution to the data.")
@Reference(prefix = "Generalization using the likelihood gain as outlier score of", //
authors = "E. Eskin", //
title = "Anomaly detection over noisy data using learned probability distributions", //
booktitle = "Proc. of the Seventeenth International Conference on Machine Learning (ICML-2000)")
public class GaussianUniformMixture<V extends NumberVector> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
  /**
   * The logger for this class.
   */
  private static final Logging LOG = Logging.getLogger(GaussianUniformMixture.class);

  /**
   * Small value to increment diagonally of a matrix in order to avoid
   * singularity before building the inverse.
   */
  private static final double SINGULARITY_CHEAT = 1E-9;

  /**
   * Holds the cutoff value: an object is flagged as an outlier (and stays in
   * the anomalous set) if moving it there increases the loglikelihood by more
   * than this threshold.
   */
  private double c;

  /**
   * log(l) precomputed, where l is the expected fraction of outliers.
   */
  private double logl;

  /**
   * log(1 - l) precomputed.
   */
  private double logml;

  /**
   * Constructor with parameters.
   *
   * @param l Expected fraction of outliers (0 &lt; l &lt; 1)
   * @param c Cutoff on the loglikelihood gain
   */
  public GaussianUniformMixture(double l, double c) {
    super();
    this.logl = Math.log(l);
    this.logml = Math.log(1 - l);
    this.c = c;
  }

  /**
   * Run the algorithm: greedily move each object into the anomalous set, keep
   * it there if the overall data loglikelihood increases by more than the
   * cutoff {@link #c}, and report the likelihood gain as outlier score.
   *
   * @param relation Data relation
   * @return Outlier result
   */
  public OutlierResult run(Relation<V> relation) {
    // Use an array list of object IDs for fast random access by an offset
    ArrayDBIDs objids = DBIDUtil.ensureArray(relation.getDBIDs());
    // A bit set to flag objects as anomalous, none at the beginning
    long[] bits = BitsUtil.zero(objids.size());
    // Both views share the same bit mask: flipping a bit moves the object
    // from the "normal" view to the "anomalous" view without copying IDs.
    DBIDs normalObjs = new MaskedDBIDs(objids, bits, true);
    // Inverted masked collection: the anomalous objects.
    DBIDs anomalousObjs = new MaskedDBIDs(objids, bits, false);
    // resulting scores
    WritableDoubleDataStore oscores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
    // Initial loglikelihood: all objects are normal, so the mixture weight
    // contributes log(1 - l) per object plus the Gaussian term.
    double logLike = relation.size() * logml + loglikelihoodNormal(normalObjs, relation);
    DoubleMinMax minmax = new DoubleMinMax();
    DBIDIter iter = objids.iter();
    for(int i = 0; i < objids.size(); i++, iter.advance()) {
      // Tentatively flag the current object as anomalous
      BitsUtil.setI(bits, i);
      // Recompute the mixture loglikelihood with the object moved over:
      // normal part (Gaussian, weight 1-l) + anomalous part (uniform, weight l)
      double currentLogLike = normalObjs.size() * logml + loglikelihoodNormal(normalObjs, relation) + anomalousObjs.size() * logl + loglikelihoodAnomalous(anomalousObjs);
      // The likelihood gain of the transfer is the outlier score.
      final double loglikeGain = currentLogLike - logLike;
      oscores.putDouble(iter, loglikeGain);
      minmax.put(loglikeGain);
      if(loglikeGain > c) {
        // Keep the object in the anomalous set, accept the new likelihood.
        logLike = currentLogLike;
      }
      else {
        // Not worth it: undo the tentative transfer.
        BitsUtil.clearI(bits, i);
      }
    }
    OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, 0.0);
    DoubleRelation res = new MaterializedDoubleRelation("Gaussian Mixture Outlier Score", "gaussian-mixture-outlier", oscores, relation.getDBIDs());
    return new OutlierResult(meta, res);
  }

  /**
   * Loglikelihood of the anomalous objects under a uniform distribution:
   * n * log(1/n) = -n * log(n).
   *
   * @param anomalousObjs Anomalous objects (must be non-empty; the caller only
   *        invokes this after flagging at least one object)
   * @return loglikelihood for anomalous objects
   */
  private double loglikelihoodAnomalous(DBIDs anomalousObjs) {
    int n = anomalousObjs.size();
    return n * Math.log(1.0 / n);
  }

  /**
   * Computes the loglikelihood of all normal objects under a single Gaussian
   * fitted to exactly these objects (mean and sample covariance).
   *
   * @param objids Object IDs for 'normal' objects.
   * @param database Database
   * @return loglikelihood for normal objects
   */
  private double loglikelihoodNormal(DBIDs objids, Relation<V> database) {
    if(objids.isEmpty()) {
      return 0;
    }
    CovarianceMatrix builder = CovarianceMatrix.make(database, objids);
    Vector mean = builder.getMeanVector();
    Matrix covarianceMatrix = builder.destroyToSampleMatrix();
    // Regularize the diagonal to guard against a singular matrix before
    // inverting (e.g. when the normal set is small or degenerate).
    Matrix covInv = covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse();
    double covarianceDet = covarianceMatrix.det();
    // Gaussian normalization constant: 1 / sqrt((2*pi)^d * det(Sigma))
    double fakt = 1.0 / Math.sqrt(MathUtil.powi(MathUtil.TWOPI, RelationUtil.dimensionality(database)) * covarianceDet);
    // Sum the per-object log densities (Mahalanobis distance based).
    double prob = 0;
    for(DBIDIter iter = objids.iter(); iter.valid(); iter.advance()) {
      Vector x = database.get(iter).getColumnVector().minusEquals(mean);
      double mDist = x.transposeTimesTimes(covInv, x);
      prob += Math.log(fakt * Math.exp(-mDist / 2.0));
    }
    return prob;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   *
   * @param <V> Vector type
   */
  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
    /**
     * Parameter to specify the fraction of expected outliers.
     */
    public static final OptionID L_ID = new OptionID("mmo.l", "expected fraction of outliers");

    /**
     * Parameter to specify the cutoff.
     */
    public static final OptionID C_ID = new OptionID("mmo.c", "cutoff");

    /**
     * Expected fraction of outliers.
     */
    protected double l = 1E-7;

    /**
     * Cutoff on the loglikelihood gain.
     */
    protected double c = 0;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      // BUGFIX: the fraction parameter must be registered under L_ID;
      // previously both parameters grabbed C_ID, so "mmo.l" was unusable.
      final DoubleParameter lP = new DoubleParameter(L_ID, 1E-7);
      if(config.grab(lP)) {
        l = lP.getValue();
      }
      final DoubleParameter cP = new DoubleParameter(C_ID, 1E-7);
      if(config.grab(cP)) {
        c = cP.getValue();
      }
    }

    @Override
    protected GaussianUniformMixture<V> makeInstance() {
      return new GaussianUniformMixture<>(l, c);
    }
  }
}