diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java | 436 |
1 files changed, 436 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java new file mode 100644 index 00000000..4e5fe9b3 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java @@ -0,0 +1,436 @@ +package de.lmu.ifi.dbs.elki.datasource.filter.transform; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorConversionFilter; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.MeanVarianceMinMax; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeSetGlobalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.EqualSizeGlobalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; + +/** + * A filter to perturb the values by adding micro-noise. + * + * The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a + * specified standard deviation or by a uniform distribution with a specified + * range. The standard deviation or the range can be scaled, attribute-wise, to + * a given percentage of the original standard deviation in the data + * distribution (assuming a Gaussian distribution there), or to a percentage of + * the extension in each attribute ({@code maximumValue - minimumValue}). + * + * This filter has a potentially wide use but has been implemented for the following publication: + * + * Reference: + * <p> + * A. Zimek, R. J. G. B. Campello, J. Sander:</br> + * Data Perturbation for Outlier Detection Ensembles.<\br> + * In: Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014. + * </p> + * + * @author Arthur Zimek + */ +@Title("Data Perturbation for Outlier Detection Ensembles") +@Description("A filter to perturb a datasset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).") +@Reference(authors = "A. Zimek, R. J. G. B. Campello, J. Sander",// +title = "Data Perturbation for Outlier Detection Ensembles", // +booktitle = "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", // +url = "http://dx.doi.org/10.1145/2618243.2618257") +public class PerturbationFilter<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> { + /** + * Class logger + */ + private static final Logging LOG = Logging.getLogger(PerturbationFilter.class); + + /** + * Scaling reference options. + * + * @author Arthur Zimek + * + * @apiviz.exclude + */ + public static enum ScalingReference { + UNITCUBE, STDDEV, MINMAX + } + + /** + * Nature of the noise distribution. + * + * @author Arthur Zimek + * + * @apiviz.exclude + */ + public static enum NoiseDistribution { + GAUSSIAN, UNIFORM + } + + /** + * Which reference to use for scaling the noise. + */ + private ScalingReference scalingreference; + + /** + * Nature of the noise distribution. + */ + private NoiseDistribution noisedistribution; + + /** + * Random object to generate the attribute-wise seeds for the noise. + */ + private final Random RANDOM; + + /** + * Percentage of the variance of the random noise generation, given the + * variance of the corresponding attribute in the data. + */ + private double percentage; + + /** + * Temporary storage used during initialization. + */ + private MeanVarianceMinMax[] mvs = null; + + /** + * Stores the scaling reference in each dimension. + */ + private double[] scalingreferencevalues = new double[0]; + + /** + * The random objects to generate noise distributions independently for each + * attribute. + */ + private Random[] randomPerAttribute = null; + + /** + * Stores the maximum in each dimension. + */ + private double[] maxima; + + /** + * Stores the minimum in each dimension. + */ + private double[] minima; + + /** + * Stores the dimensionality from the preprocessing. + */ + private int dimensionality = 0; + + /** + * Constructor. + * + * @param seed Seed value, may be {@code null} for a random seed. + * @param percentage Relative amount of jitter to add + * @param scalingreference Scaling reference + * @param minima Preset minimum values. May be {@code null}. + * @param maxima Preset maximum values. May be {@code null}. + * @param noisedistribution Nature of the noise distribution. + */ + public PerturbationFilter(Long seed, double percentage, ScalingReference scalingreference, double[] minima, double[] maxima, NoiseDistribution noisedistribution) { + super(); + this.percentage = percentage; + this.scalingreference = scalingreference; + this.minima = minima; + this.maxima = maxima; + this.noisedistribution = noisedistribution; + this.RANDOM = (seed == null) ? new Random() : new Random(seed); + } + + @Override + protected boolean prepareStart(SimpleTypeInformation<V> in) { + if(scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0) { + dimensionality = minima.length; + scalingreferencevalues = new double[dimensionality]; + randomPerAttribute = new Random[dimensionality]; + for(int d = 0; d < dimensionality; d++) { + scalingreferencevalues[d] = (maxima[d] - minima[d]) * percentage; + if(scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { + scalingreferencevalues[d] = percentage; + } + randomPerAttribute[d] = new Random(RANDOM.nextLong()); + } + return false; + } + if(scalingreference == ScalingReference.UNITCUBE) { + return false; + } + return (scalingreferencevalues.length == 0); + } + + @Override + protected void prepareProcessInstance(V featureVector) { + // First object? Then init. (We didn't have a dimensionality before!) + if(mvs == null) { + dimensionality = featureVector.getDimensionality(); + mvs = MeanVarianceMinMax.newArray(dimensionality); + } + for(int d = 0; d < featureVector.getDimensionality(); d++) { + mvs[d].put(featureVector.doubleValue(d)); + } + } + + @Override + protected void prepareComplete() { + StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null; + scalingreferencevalues = new double[dimensionality]; + randomPerAttribute = new Random[dimensionality]; + if(scalingreference == ScalingReference.STDDEV) { + if(buf != null) { + buf.append("Standard deviation per attribute: "); + } + for(int d = 0; d < dimensionality; d++) { + scalingreferencevalues[d] = mvs[d].getSampleStddev() * percentage; + if(scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { + scalingreferencevalues[d] = percentage; + } + randomPerAttribute[d] = new Random(RANDOM.nextLong()); + if(buf != null) { + buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage); + } + } + } + else if(scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0) { + if(buf != null) { + buf.append("extension per attribute: "); + } + for(int d = 0; d < dimensionality; d++) { + scalingreferencevalues[d] = (mvs[d].getMax() - mvs[d].getMin()) * percentage; + if(scalingreferencevalues[d] == 0 || Double.isNaN(scalingreferencevalues[d])) { + scalingreferencevalues[d] = percentage; + } + randomPerAttribute[d] = new Random(RANDOM.nextLong()); + if(buf != null) { + buf.append(" ").append(d).append(": ").append(scalingreferencevalues[d] / percentage); + } + } + } + mvs = null; + if(buf != null) { + LOG.debugFine(buf.toString()); + } + } + + @Override + protected SimpleTypeInformation<? super V> getInputTypeRestriction() { + return TypeUtil.NUMBER_VECTOR_FIELD; + } + + @Override + protected V filterSingleObject(V featureVector) { + if(scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) { + dimensionality = featureVector.getDimensionality(); + scalingreferencevalues = new double[dimensionality]; + randomPerAttribute = new Random[dimensionality]; + for(int d = 0; d < dimensionality; d++) { + scalingreferencevalues[d] = percentage; + randomPerAttribute[d] = new Random(RANDOM.nextLong()); + } + } + if(scalingreferencevalues.length != featureVector.getDimensionality()) { + throw new IllegalArgumentException("FeatureVectors and given Minima/Maxima differ in length."); + } + double[] values = new double[featureVector.getDimensionality()]; + for(int d = 0; d < featureVector.getDimensionality(); d++) { + if(this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) { + values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextGaussian() * scalingreferencevalues[d]; + } + else if(this.noisedistribution.equals(NoiseDistribution.UNIFORM)) { + values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextDouble() * scalingreferencevalues[d]; + } + } + return factory.newNumberVector(values); + } + + @Override + protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) { + initializeOutputType(in); + return in; + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Arthur Zimek + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * Parameter for minimum. + */ + public static final OptionID MINIMA_ID = new OptionID("perturbationfilter.min", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken."); + + /** + * Parameter for maximum. + */ + public static final OptionID MAXIMA_ID = new OptionID("perturbationfilter.max", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken."); + + /** + * Stores the maximum in each dimension. + */ + private double[] maxima = new double[0]; + + /** + * Stores the minimum in each dimension. + */ + private double[] minima = new double[0]; + + /** + * Optional parameter to specify a seed for random Gaussian noise + * generation. If unused, system time is used as seed. + * <p> + * Key: {@code -perturbationfilter.seed} + * </p> + */ + public static final OptionID SEED_ID = new OptionID("perturbationfilter.seed", "Seed for random noise generation."); + + /** + * Seed for randomly shuffling the rows of the database. If null, system + * time is used as seed. + */ + protected Long seed = null; + + /** + * Optional parameter to specify a percentage of the standard deviation of + * the random Gaussian noise generation, given the standard deviation of the + * corresponding attribute in the original data distribution (assuming a + * Gaussian there). + * + * <p> + * Key: {@code -perturbationfilter.percentage} + * </p> + * <p> + * Default: <code>0.01</code> + * </p> + * <p> + * Constraint: 0 < percentage ≤1 + * </p> + */ + public static final OptionID PERCENTAGE_ID = new OptionID("perturbationfilter.percentage", "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there)."); + + /** + * Parameter for selecting scaling reference. + * <p> + * Key: {@code -perturbationfilter.scalingreference} + * </p> + * <p> + * Default: <code>ScalingReference.UNITCUBE</code> + * </p> + */ + public static final OptionID SCALINGREFERENCE_ID = new OptionID("perturbationfilter.scalingreference", "The reference for scaling the Gaussian noise. Default is " + ScalingReference.UNITCUBE + ", parameter " + PERCENTAGE_ID.getName() + " will then directly define the standard deviation of all noise Gaussians. For options " + ScalingReference.STDDEV + " and " + ScalingReference.MINMAX + ", the percentage of the attributewise standard deviation or extension, repectively, will define the attributewise standard deviation of the noise Gaussians."); + + /** + * Parameter for selecting the noise distribution. + * + * <p> + * Key: {@code -perturbationfilter.noisedistribution} + * </p> + * <p> + * Default: <code>NoiseDistribution.UNIFORM</code> + * </p> + * + */ + public static final OptionID NOISEDISTRIBUTION_ID = new OptionID("perturbationfilter.noisedistribution", "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM); + + /** + * Percentage of the variance of the random Gaussian noise generation or of + * the range of the uniform distribution, given the variance of the + * corresponding attribute in the data. + */ + protected double percentage; + + /** + * The option which reference to use for scaling the noise. + */ + protected ScalingReference scalingreference; + + /** + * The option which nature of noise distribution to choose. + */ + protected NoiseDistribution noisedistribution; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + EnumParameter<ScalingReference> scalingReferenceP = new EnumParameter<>(SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE); + if(config.grab(scalingReferenceP)) { + scalingreference = scalingReferenceP.getValue(); + } + EnumParameter<NoiseDistribution> noisedistributionP = new EnumParameter<>(NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM); + if(config.grab(noisedistributionP)) { + noisedistribution = noisedistributionP.getValue(); + } + DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, .01); + percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); + if(config.grab(percentageP)) { + percentage = percentageP.getValue(); + } + LongParameter seedP = new LongParameter(SEED_ID); + seedP.setOptional(true); + if(config.grab(seedP)) { + seed = seedP.getValue(); + } + DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID); + minimaP.setOptional(true); + if(config.grab(minimaP)) { + minima = ArrayLikeUtil.toPrimitiveDoubleArray(minimaP.getValue()); + } + DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID); + maximaP.setOptional(true); + if(config.grab(maximaP)) { + maxima = ArrayLikeUtil.toPrimitiveDoubleArray(maximaP.getValue()); + } + + config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP)); + config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP)); + } + + @Override + protected PerturbationFilter<V> makeInstance() { + return new PerturbationFilter<>(seed, percentage, scalingreference, minima, maxima, noisedistribution); + } + } +} |