package de.lmu.ifi.dbs.elki.datasource.filter.transform;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.Random;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorConversionFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MeanVarianceMinMax;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeSetGlobalConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.EqualSizeGlobalConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
/**
* A filter to perturb the values by adding micro-noise.
*
* The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a
* specified standard deviation or by a uniform distribution with a specified
* range. The standard deviation or the range can be scaled, attribute-wise, to
* a given percentage of the original standard deviation in the data
* distribution (assuming a Gaussian distribution there), or to a percentage of
* the extension in each attribute ({@code maximumValue - minimumValue}).
*
* This filter has a potentially wide use but has been implemented for the
* following publication:
*
* Reference:
*
* A. Zimek, R. J. G. B. Campello, J. Sander: Data Perturbation for Outlier
* Detection Ensembles.<\br> In: Proc. 26th International Conference on
* Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark,
* 2014.
*
*
* @author Arthur Zimek
*/
@Title("Data Perturbation for Outlier Detection Ensembles")
@Description("A filter to perturb a datasset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).")
@Reference(authors = "A. Zimek, R. J. G. B. Campello, J. Sander",//
title = "Data Perturbation for Outlier Detection Ensembles", //
booktitle = "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", //
url = "http://dx.doi.org/10.1145/2618243.2618257")
public class PerturbationFilter extends AbstractVectorConversionFilter {
/**
* Class logger
*/
private static final Logging LOG = Logging.getLogger(PerturbationFilter.class);
/**
* Scaling reference options.
*
* @author Arthur Zimek
*
* @apiviz.exclude
*/
public static enum ScalingReference {
UNITCUBE, STDDEV, MINMAX
}
/**
* Nature of the noise distribution.
*
* @author Arthur Zimek
*
* @apiviz.exclude
*/
public static enum NoiseDistribution {
GAUSSIAN, UNIFORM
}
/**
* Which reference to use for scaling the noise.
*/
private ScalingReference scalingreference;
/**
* Nature of the noise distribution.
*/
private NoiseDistribution noisedistribution;
/**
* Random object to generate the attribute-wise seeds for the noise.
*/
private final Random RANDOM;
/**
* Percentage of the variance of the random noise generation, given the
* variance of the corresponding attribute in the data.
*/
private double percentage;
/**
* Temporary storage used during initialization.
*/
private MeanVarianceMinMax[] mvs = null;
/**
* Stores the scaling reference in each dimension.
*/
private double[] scalingreferencevalues = new double[0];
/**
* The random objects to generate noise distributions independently for each
* attribute.
*/
private Random[] randomPerAttribute = null;
/**
* Stores the maximum in each dimension.
*/
private double[] maxima;
/**
* Stores the minimum in each dimension.
*/
private double[] minima;
/**
* Stores the dimensionality from the preprocessing.
*/
private int dimensionality = 0;
/**
* Constructor.
*
* @param seed Seed value, may be {@code null} for a random seed.
* @param percentage Relative amount of jitter to add
* @param scalingreference Scaling reference
* @param minima Preset minimum values. May be {@code null}.
* @param maxima Preset maximum values. May be {@code null}.
* @param noisedistribution Nature of the noise distribution.
*/
public PerturbationFilter(Long seed, double percentage, ScalingReference scalingreference, double[] minima, double[] maxima, NoiseDistribution noisedistribution) {
super();
this.percentage = percentage;
this.scalingreference = scalingreference;
this.minima = minima;
this.maxima = maxima;
this.noisedistribution = noisedistribution;
this.RANDOM = (seed == null) ? new Random() : new Random(seed);
}
@Override
protected boolean prepareStart(SimpleTypeInformation in) {
if(scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0) {
dimensionality = minima.length;
scalingreferencevalues = new double[dimensionality];
randomPerAttribute = new Random[dimensionality];
for(int d = 0; d < dimensionality; d++) {
scalingreferencevalues[d] = (maxima[d] - minima[d]) * percentage;
if(scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
scalingreferencevalues[d] = percentage;
}
randomPerAttribute[d] = new Random(RANDOM.nextLong());
}
return false;
}
if(scalingreference == ScalingReference.UNITCUBE) {
return false;
}
return (scalingreferencevalues.length == 0);
}
@Override
protected void prepareProcessInstance(V featureVector) {
// First object? Then init. (We didn't have a dimensionality before!)
if(mvs == null) {
dimensionality = featureVector.getDimensionality();
mvs = MeanVarianceMinMax.newArray(dimensionality);
}
for(int d = 0; d < featureVector.getDimensionality(); d++) {
mvs[d].put(featureVector.doubleValue(d));
}
}
@Override
protected void prepareComplete() {
StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
scalingreferencevalues = new double[dimensionality];
randomPerAttribute = new Random[dimensionality];
if(scalingreference == ScalingReference.STDDEV) {
if(buf != null) {
buf.append("Standard deviation per attribute: ");
}
for(int d = 0; d < dimensionality; d++) {
scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage;
if(scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
scalingreferencevalues[d] = percentage;
}
randomPerAttribute[d] = new Random(RANDOM.nextLong());
if(buf != null) {
buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
}
}
}
else if(scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0) {
if(buf != null) {
buf.append("extension per attribute: ");
}
for(int d = 0; d < dimensionality; d++) {
scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage;
if(scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) {
scalingreferencevalues[d] = percentage;
}
randomPerAttribute[d] = new Random(RANDOM.nextLong());
if(buf != null) {
buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage);
}
}
}
mvs = null;
if(buf != null) {
LOG.debugFine(buf.toString());
}
}
@Override
protected SimpleTypeInformation super V> getInputTypeRestriction() {
return TypeUtil.NUMBER_VECTOR_FIELD;
}
@Override
protected V filterSingleObject(V featureVector) {
if(scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) {
dimensionality = featureVector.getDimensionality();
scalingreferencevalues = new double[dimensionality];
randomPerAttribute = new Random[dimensionality];
for(int d = 0; d < dimensionality; d++) {
scalingreferencevalues[d] = percentage;
randomPerAttribute[d] = new Random(RANDOM.nextLong());
}
}
if(scalingreferencevalues.length != featureVector.getDimensionality()) {
throw new IllegalArgumentException("FeatureVectors and given Minima/Maxima differ in length.");
}
double[] values = new double[featureVector.getDimensionality()];
for(int d = 0; d < featureVector.getDimensionality(); d++) {
if(this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) {
values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextGaussian() * scalingreferencevalues[d];
}
else if(this.noisedistribution.equals(NoiseDistribution.UNIFORM)) {
values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextDouble() * scalingreferencevalues[d];
}
}
return factory.newNumberVector(values);
}
@Override
protected SimpleTypeInformation super V> convertedType(SimpleTypeInformation in) {
initializeOutputType(in);
return in;
}
@Override
protected Logging getLogger() {
return LOG;
}
/**
* Parameterization class.
*
* @author Arthur Zimek
*
* @apiviz.exclude
*/
public static class Parameterizer extends AbstractParameterizer {
/**
* Parameter for minimum.
*/
public static final OptionID MINIMA_ID = new OptionID("perturbationfilter.min", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
/**
* Parameter for maximum.
*/
public static final OptionID MAXIMA_ID = new OptionID("perturbationfilter.max", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
/**
* Stores the maximum in each dimension.
*/
private double[] maxima = new double[0];
/**
* Stores the minimum in each dimension.
*/
private double[] minima = new double[0];
/**
* Optional parameter to specify a seed for random Gaussian noise
* generation. If unused, system time is used as seed.
*
* Key: {@code -perturbationfilter.seed}
*
*/
public static final OptionID SEED_ID = new OptionID("perturbationfilter.seed", "Seed for random noise generation.");
/**
* Seed for randomly shuffling the rows of the database. If null, system
* time is used as seed.
*/
protected Long seed = null;
/**
* Optional parameter to specify a percentage of the standard deviation of
* the random Gaussian noise generation, given the standard deviation of the
* corresponding attribute in the original data distribution (assuming a
* Gaussian there).
*
*
* Key: {@code -perturbationfilter.percentage}
*
*
* Default: 0.01
*
*
* Constraint: 0 < percentage ≤1
*
*/
public static final OptionID PERCENTAGE_ID = new OptionID("perturbationfilter.percentage", "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there).");
/**
* Parameter for selecting scaling reference.
*
* Key: {@code -perturbationfilter.scalingreference}
*
*
* Default: ScalingReference.UNITCUBE
*
*/
public static final OptionID SCALINGREFERENCE_ID = new OptionID("perturbationfilter.scalingreference", "The reference for scaling the Gaussian noise. Default is " + ScalingReference.UNITCUBE + ", parameter " + PERCENTAGE_ID.getName() + " will then directly define the standard deviation of all noise Gaussians. For options " + ScalingReference.STDDEV + " and " + ScalingReference.MINMAX + ", the percentage of the attributewise standard deviation or extension, repectively, will define the attributewise standard deviation of the noise Gaussians.");
/**
* Parameter for selecting the noise distribution.
*
*
* Key: {@code -perturbationfilter.noisedistribution}
*
*
* Default: NoiseDistribution.UNIFORM
*
*
*/
public static final OptionID NOISEDISTRIBUTION_ID = new OptionID("perturbationfilter.noisedistribution", "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM);
/**
* Percentage of the variance of the random Gaussian noise generation or of
* the range of the uniform distribution, given the variance of the
* corresponding attribute in the data.
*/
protected double percentage;
/**
* The option which reference to use for scaling the noise.
*/
protected ScalingReference scalingreference;
/**
* The option which nature of noise distribution to choose.
*/
protected NoiseDistribution noisedistribution;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
EnumParameter scalingReferenceP = new EnumParameter<>(SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE);
if(config.grab(scalingReferenceP)) {
scalingreference = scalingReferenceP.getValue();
}
EnumParameter noisedistributionP = new EnumParameter<>(NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM);
if(config.grab(noisedistributionP)) {
noisedistribution = noisedistributionP.getValue();
}
DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, .01);
percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
if(config.grab(percentageP)) {
percentage = percentageP.getValue();
}
LongParameter seedP = new LongParameter(SEED_ID);
seedP.setOptional(true);
if(config.grab(seedP)) {
seed = seedP.getValue();
}
DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID);
minimaP.setOptional(true);
if(config.grab(minimaP)) {
minima = minimaP.getValue().clone();
}
DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID);
maximaP.setOptional(true);
if(config.grab(maximaP)) {
maxima = maximaP.getValue().clone();
}
config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP));
config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP));
}
@Override
protected PerturbationFilter makeInstance() {
return new PerturbationFilter<>(seed, percentage, scalingreference, minima, maxima, noisedistribution);
}
}
}