1 files changed, 217 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
new file mode 100644
index 00000000..ec50aadd
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
@@ -0,0 +1,217 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+
+/**
+ * Median Absolute Deviation is used for scaling the data set as follows:
+ * 
+ * First, the median, and median absolute deviation are computed in each axis.
+ * Then, each value is projected to (x - median(X)) / MAD(X).
+ * 
+ * This is similar to z-standardization of data sets, except that it is more
+ * robust towards outliers, and only slightly more expensive to compute.
+ * 
+ * @author Erich Schubert
+ * @param <V> vector type
+ * 
+ * @apiviz.uses NumberVector
+ */
+// TODO: extract superclass AbstractAttributeWiseNormalization
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMADNormalization"})
+public class AttributeWiseMADNormalization<V extends NumberVector> implements Normalization<V> {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(AttributeWiseMADNormalization.class);
+
+  /**
+   * Number vector factory.
+   */
+  protected NumberVector.Factory<V> factory;
+
+  /**
+   * Stores the median in each dimension.
+   */
+  private double[] median = new double[0];
+
+  /**
+   * Stores the inverse median absolute deviation in each dimension.
+   */
+  private double[] imadsigma = new double[0];
+
+  /**
+   * Constructor.
+   */
+  public AttributeWiseMADNormalization() {
+    super();
+  }
+
+  @Override
+  public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+    if(objects.dataLength() == 0) {
+      return objects;
+    }
+    for(int r = 0; r < objects.metaLength(); r++) {
+      SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
+      final List<?> column = (List<?>) objects.getColumn(r);
+      if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+        continue;
+      }
+      @SuppressWarnings("unchecked")
+      final List<V> castColumn = (List<V>) column;
+      // Get the replacement type information
+      @SuppressWarnings("unchecked")
+      final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+      factory = FilterUtil.guessFactory(castType);
+
+      // Scan to find the best
+      final int dim = castType.getDimensionality();
+      median = new double[dim];
+      imadsigma = new double[dim];
+      // Scratch space for testing:
+      double[] test = new double[castColumn.size()];
+
+      FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Analyzing data.", dim, LOG) : null;
+      // We iterate over dimensions, this kind of filter needs fast random
+      // access.
+      for(int d = 0; d < dim; d++) {
+        for(int i = 0; i < test.length; i++) {
+          test[i] = castColumn.get(i).doubleValue(d);
+        }
+        final double med = QuickSelect.median(test);
+        median[d] = med;
+        int zeros = 0;
+        for(int i = 0; i < test.length; i++) {
+          test[i] = Math.abs(test[i] - med);
+          if(test[i] == 0.) {
+            zeros++;
+          }
+        }
+        // Rescale the true MAD for the best standard deviation estimate:
+        if(zeros < (test.length >>> 1)) {
+          imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(test);
+        }
+        else if(zeros == test.length) {
+          LOG.warning("Constant attribute detected. Using MAD=1.");
+          imadsigma[d] = 1.; // Does not matter. Constant distribution.
+        }
+        else {
+          // We have more than 50% zeros, so the regular MAD estimate does not
+          // work. Generalize the MAD approach to use the 50% non-zero value:
+          final int rank = zeros + ((test.length - zeros) >> 1);
+          final double rel = .5 + rank * .5 / test.length;
+          imadsigma[d] = NormalDistribution.quantile(0., 1., rel) / QuickSelect.quickSelect(test, rank);
+          LOG.warning("Near-constant attribute detected. Using modified MAD.");
+        }
+        LOG.incrementProcessed(dprog);
+      }
+      LOG.ensureCompleted(dprog);
+
+      FiniteProgress nprog = LOG.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), LOG) : null;
+      // Normalization scan
+      double[] buf = new double[dim];
+      for(int i = 0; i < objects.dataLength(); i++) {
+        final V obj = castColumn.get(i);
+        for(int d = 0; d < dim; d++) {
+          buf[d] = normalize(d, obj.doubleValue(d));
+        }
+        castColumn.set(i, factory.newNumberVector(buf));
+        LOG.incrementProcessed(nprog);
+      }
+      LOG.ensureCompleted(nprog);
+    }
+    return objects;
+  }
+
+  @Override
+  public V restore(V featureVector) throws NonNumericFeaturesException {
+    if(featureVector.getDimensionality() == median.length) {
+      double[] values = new double[featureVector.getDimensionality()];
+      for(int d = 0; d < featureVector.getDimensionality(); d++) {
+        values[d] = restore(d, featureVector.doubleValue(d));
+      }
+      return factory.newNumberVector(values);
+    }
+    else {
+      throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + median.length);
+    }
+  }
+
+  @Override
+  public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) throws NonNumericFeaturesException {
+    throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+  }
+
+  /**
+   * Normalize a single dimension.
+   * 
+   * @param d Dimension
+   * @param val Value
+   * @return Normalized value
+   */
+  private double normalize(int d, double val) {
+    return (val - median[d]) * imadsigma[d];
+  }
+
+  /**
+   * Restore a single dimension.
+   * 
+   * @param d Dimension
+   * @param val Value
+   * @return Normalized value
+   */
+  private double restore(int d, double val) {
+    return (val / imadsigma[d]) + median[d];
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder();
+    result.append("normalization class: ").append(getClass().getName());
+    result.append('\n');
+    result.append("normalization median: ").append(FormatUtil.format(median));
+    result.append('\n');
+    result.append("normalization scaling factor: ").append(FormatUtil.format(imadsigma));
+    return result.toString();
+  }
+}