summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java217
1 files changed, 217 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
new file mode 100644
index 00000000..ec50aadd
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
@@ -0,0 +1,217 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+
+/**
+ * Median Absolute Deviation is used for scaling the data set as follows:
+ *
+ * First, the median, and median absolute deviation are computed in each axis.
+ * Then, each value is projected to (x - median(X)) / MAD(X).
+ *
+ * This is similar to z-standardization of data sets, except that it is more
+ * robust towards outliers, and only slightly more expensive to compute.
+ *
+ * @author Erich Schubert
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ */
+// TODO: extract superclass AbstractAttributeWiseNormalization
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMADNormalization"})
+public class AttributeWiseMADNormalization<V extends NumberVector> implements Normalization<V> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(AttributeWiseMADNormalization.class);
+
+ /**
+ * Number vector factory.
+ */
+ protected NumberVector.Factory<V> factory;
+
+ /**
+ * Stores the median in each dimension.
+ */
+ private double[] median = new double[0];
+
+ /**
+ * Stores the inverse median absolute deviation in each dimension.
+ */
+ private double[] imadsigma = new double[0];
+
+ /**
+ * Constructor.
+ */
+ public AttributeWiseMADNormalization() {
+ super();
+ }
+
+ @Override
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ if(objects.dataLength() == 0) {
+ return objects;
+ }
+ for(int r = 0; r < objects.metaLength(); r++) {
+ SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
+ final List<?> column = (List<?>) objects.getColumn(r);
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ continue;
+ }
+ @SuppressWarnings("unchecked")
+ final List<V> castColumn = (List<V>) column;
+ // Get the replacement type information
+ @SuppressWarnings("unchecked")
+ final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+ factory = FilterUtil.guessFactory(castType);
+
+ // Scan to find the best
+ final int dim = castType.getDimensionality();
+ median = new double[dim];
+ imadsigma = new double[dim];
+ // Scratch space for testing:
+ double[] test = new double[castColumn.size()];
+
+ FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Analyzing data.", dim, LOG) : null;
+ // We iterate over dimensions, this kind of filter needs fast random
+ // access.
+ for(int d = 0; d < dim; d++) {
+ for(int i = 0; i < test.length; i++) {
+ test[i] = castColumn.get(i).doubleValue(d);
+ }
+ final double med = QuickSelect.median(test);
+ median[d] = med;
+ int zeros = 0;
+ for(int i = 0; i < test.length; i++) {
+ test[i] = Math.abs(test[i] - med);
+ if(test[i] == 0.) {
+ zeros++;
+ }
+ }
+ // Rescale the true MAD for the best standard deviation estimate:
+ if(zeros < (test.length >>> 1)) {
+ imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(test);
+ }
+ else if(zeros == test.length) {
+ LOG.warning("Constant attribute detected. Using MAD=1.");
+ imadsigma[d] = 1.; // Does not matter. Constant distribution.
+ }
+ else {
+ // We have more than 50% zeros, so the regular MAD estimate does not
+ // work. Generalize the MAD approach to use the 50% non-zero value:
+ final int rank = zeros + ((test.length - zeros) >> 1);
+ final double rel = .5 + rank * .5 / test.length;
+ imadsigma[d] = NormalDistribution.quantile(0., 1., rel) / QuickSelect.quickSelect(test, rank);
+ LOG.warning("Near-constant attribute detected. Using modified MAD.");
+ }
+ LOG.incrementProcessed(dprog);
+ }
+ LOG.ensureCompleted(dprog);
+
+ FiniteProgress nprog = LOG.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), LOG) : null;
+ // Normalization scan
+ double[] buf = new double[dim];
+ for(int i = 0; i < objects.dataLength(); i++) {
+ final V obj = castColumn.get(i);
+ for(int d = 0; d < dim; d++) {
+ buf[d] = normalize(d, obj.doubleValue(d));
+ }
+ castColumn.set(i, factory.newNumberVector(buf));
+ LOG.incrementProcessed(nprog);
+ }
+ LOG.ensureCompleted(nprog);
+ }
+ return objects;
+ }
+
+ @Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ if(featureVector.getDimensionality() == median.length) {
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = restore(d, featureVector.doubleValue(d));
+ }
+ return factory.newNumberVector(values);
+ }
+ else {
+ throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + median.length);
+ }
+ }
+
+ @Override
+ public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) throws NonNumericFeaturesException {
+ throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+ }
+
+ /**
+ * Normalize a single dimension.
+ *
+ * @param d Dimension
+ * @param val Value
+ * @return Normalized value
+ */
+ private double normalize(int d, double val) {
+ return (val - median[d]) * imadsigma[d];
+ }
+
+ /**
+ * Restore a single dimension.
+ *
+ * @param d Dimension
+ * @param val Value
+ * @return Normalized value
+ */
+ private double restore(int d, double val) {
+ return (val / imadsigma[d]) + median[d];
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder result = new StringBuilder();
+ result.append("normalization class: ").append(getClass().getName());
+ result.append('\n');
+ result.append("normalization median: ").append(FormatUtil.format(median));
+ result.append('\n');
+ result.append("normalization scaling factor: ").append(FormatUtil.format(imadsigma));
+ return result.toString();
+ }
+}