summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/filter/normalization')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java14
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java77
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java326
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java)171
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java)28
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java)81
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java207
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java)58
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java)96
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java)83
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java)19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java97
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java159
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java177
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java)37
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java119
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java2
22 files changed, 1488 insertions, 337 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
index 0b4d7ae0..8ad13355 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,9 +33,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
*
* @author Elke Achtert
*
- * @param <O> Object type processed
+ * @param <V> Object type processed
*/
-public abstract class AbstractNormalization<O extends NumberVector<?>> extends AbstractVectorConversionFilter<O, O> implements Normalization<O> {
+public abstract class AbstractNormalization<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> implements Normalization<V> {
/**
* Initializes the option handler and the parameter map.
*/
@@ -44,12 +44,18 @@ public abstract class AbstractNormalization<O extends NumberVector<?>> extends A
}
@Override
- protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<O> in) {
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
initializeOutputType(in);
return in;
}
@Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ // FIXME: implement everywhere.
+ throw new UnsupportedOperationException("Not implemented yet.");
+ }
+
+ @Override
public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
// FIXME: implement.
throw new UnsupportedOperationException("Not yet implemented!");
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
index 54fc7794..38e0bf31 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,9 +33,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
*
* @author Erich Schubert
*
- * @param <O> Object type processed
+ * @param <V> Object type processed
*/
-public abstract class AbstractStreamNormalization<O extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<O, O> implements Normalization<O> {
+public abstract class AbstractStreamNormalization<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> implements Normalization<V> {
/**
* Initializes the option handler and the parameter map.
*/
@@ -44,12 +44,17 @@ public abstract class AbstractStreamNormalization<O extends NumberVector<?>> ext
}
@Override
- protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<O> in) {
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
initializeOutputType(in);
return in;
}
@Override
+ public V restore(V featureVector) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
// FIXME: implement.
throw new UnsupportedOperationException("Not yet implemented!");
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
index 0abaac95..d9002c93 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
index 3c3e7bdf..bf913852 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
/**
* Normalization performs a normalization on a set of feature vectors and is
@@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
*
* @param <O> object type
*/
-public interface Normalization<O> extends ObjectFilter, Parameterizable {
+public interface Normalization<O> extends ObjectFilter {
/**
* Transforms a feature vector to the original attribute ranges.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
deleted file mode 100644
index 09b73aa4..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import gnu.trove.map.hash.TIntDoubleHashMap;
-import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-
-/**
- * Perform full TF-IDF Normalization as commonly used in text mining.
- *
- * Each record is first normalized using "term frequencies" to sum up to 1. Then
- * it is globally normalized using the Inverse Document Frequency, so rare terms
- * are weighted stronger than common terms.
- *
- * Restore will only undo the IDF part of the normalization!
- *
- * @author Erich Schubert
- *
- * @param <V> Vector type
- */
-public class TFIDFNormalization<V extends SparseNumberVector<?>> extends InverseDocumentFrequencyNormalization<V> {
- /**
- * Class logger.
- */
- private static final Logging LOG = Logging.getLogger(TFIDFNormalization.class);
-
- /**
- * Constructor.
- */
- public TFIDFNormalization() {
- super();
- }
-
- @Override
- protected V filterSingleObject(V featureVector) {
- double sum = 0.0;
- for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) {
- sum += featureVector.iterDoubleValue(it);
- }
- if(sum <= 0) {
- sum = 1.0;
- }
- TIntDoubleHashMap vals = new TIntDoubleHashMap();
- for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) {
- final int dim = featureVector.iterDim(it);
- vals.put(dim, featureVector.iterDoubleValue(it) / sum * idf.get(dim));
- }
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java
new file mode 100644
index 00000000..a1618b9f
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java
@@ -0,0 +1,326 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta.BestFitEstimator;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParameter;
+
+/**
+ * Project the data using a Beta distribution.
+ *
+ * This is a crude heuristic that may or may not work for your data set. There
+ * is currently no theoretical foundation for why it may or may not be sensible
+ * to do this.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ * @apiviz.uses DistributionEstimator
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseBetaNormalization"})
+public class AttributeWiseBetaNormalization<V extends NumberVector> implements Normalization<V> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(AttributeWiseBetaNormalization.class);
+
+ /**
+ * Stores the distribution estimators
+ */
+ private List<DistributionEstimator<?>> estimators;
+
+ /**
+ * Stores the estimated distributions
+ */
+ private List<Distribution> dists;
+
+ /**
+ * Number vector factory.
+ */
+ protected NumberVector.Factory<V> factory;
+
+ /**
+ * Expected outlier rate alpha.
+ */
+ protected double alpha = 0.01;
+
+ /**
+ * Constructor.
+ * @param estimators Distribution estimators
+ * @param alpha Expected outlier rate alpha
+ */
+ public AttributeWiseBetaNormalization(List<DistributionEstimator<?>> estimators, double alpha) {
+ super();
+ this.estimators = estimators;
+ this.alpha = alpha;
+ }
+
+ @Override
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ if(objects.dataLength() == 0) {
+ return objects;
+ }
+ for(int r = 0; r < objects.metaLength(); r++) {
+ SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
+ final List<?> column = (List<?>) objects.getColumn(r);
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ continue;
+ }
+ @SuppressWarnings("unchecked")
+ final List<V> castColumn = (List<V>) column;
+ // Get the replacement type information
+ @SuppressWarnings("unchecked")
+ final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+ factory = FilterUtil.guessFactory(castType);
+
+ // Scan to find the best
+ final int dim = castType.getDimensionality();
+ dists = new ArrayList<>(dim);
+ // Scratch space for testing:
+ double[] test = new double[castColumn.size()];
+
+ // We iterate over dimensions, this kind of filter needs fast random
+ // access.
+ Adapter adapter = new Adapter();
+ for(int d = 0; d < dim; d++) {
+ adapter.dim = d;
+ if(estimators.size() == 1) {
+ dists.add(estimators.get(0).estimate(castColumn, adapter));
+ continue;
+ }
+ Distribution best = null;
+ double bestq = Double.POSITIVE_INFINITY;
+ trials: for(DistributionEstimator<?> est : estimators) {
+ try {
+ Distribution dist = est.estimate(castColumn, adapter);
+ for(int i = 0; i < test.length; i++) {
+ test[i] = dist.cdf(castColumn.get(i).doubleValue(d));
+ if(Double.isNaN(test[i])) {
+ LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
+ continue trials;
+ }
+ if(Double.isInfinite(test[i])) {
+ LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
+ continue trials;
+ }
+ }
+ Arrays.sort(test);
+ double q = KolmogorovSmirnovTest.simpleTest(test);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
+ }
+ if(best == null || q < bestq) {
+ best = dist;
+ bestq = q;
+ }
+ }
+ catch(ArithmeticException e) {
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
+ }
+ continue;
+ }
+ }
+ if(LOG.isVerbose()) {
+ LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
+ }
+ dists.add(best);
+ }
+
+ // Beta distribution for projection
+ double p = Math.pow(alpha, -1 / Math.sqrt(dim));
+ BetaDistribution beta = new BetaDistribution(p, p);
+ // Normalization scan
+ double[] buf = new double[dim];
+ for(int i = 0; i < objects.dataLength(); i++) {
+ final V obj = castColumn.get(i);
+ for(int d = 0; d < dim; d++) {
+ // TODO: when available, use logspace for better numerical precision!
+ buf[d] = beta.quantile(dists.get(d).cdf(obj.doubleValue(d)));
+ }
+ castColumn.set(i, factory.newNumberVector(buf));
+ }
+ }
+ return objects;
+ }
+
+ @Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+ }
+
+ @Override
+ public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+ throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder result = new StringBuilder();
+ result.append("normalization class: ").append(getClass().getName());
+ result.append('\n');
+ result.append("normalization distributions: ");
+ boolean first = true;
+ for(DistributionEstimator<?> est : estimators) {
+ if(!first) {
+ result.append(',');
+ }
+ first = false;
+ result.append(est.getClass().getSimpleName());
+ }
+ return result.toString();
+ }
+
+ /**
+ * Array adapter class for vectors.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector>> {
+ /**
+ * Dimension to process.
+ */
+ int dim;
+
+ @Override
+ public int size(List<? extends NumberVector> array) {
+ return array.size();
+ }
+
+ @Override
+ public Double get(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return getDouble(array, off);
+ }
+
+ @Override
+ public double getDouble(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).doubleValue(dim);
+ }
+
+ @Override
+ public float getFloat(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).floatValue(dim);
+ }
+
+ @Override
+ public int getInteger(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).intValue(dim);
+ }
+
+ @Override
+ public short getShort(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).shortValue(dim);
+ }
+
+ @Override
+ public long getLong(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).longValue(dim);
+ }
+
+ @Override
+ public byte getByte(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).byteValue(dim);
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for distribution estimators.
+ */
+ public static final OptionID DISTRIBUTIONS_ID = new OptionID("betanormalize.distributions", "A list of the distribution estimators to try.");
+
+ /**
+ * Shape parameter.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("betanormalize.alpha", "Alpha parameter to control the shape of the output distribution.");
+
+ /**
+ * Stores the distribution estimators
+ */
+ private List<DistributionEstimator<?>> estimators;
+
+ /**
+ * Expected outlier rate alpha.
+ */
+ private double alpha;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectListParameter<DistributionEstimator<?>> estP = new ObjectListParameter<>(DISTRIBUTIONS_ID, DistributionEstimator.class);
+ List<Class<? extends DistributionEstimator<?>>> def = new ArrayList<>(1);
+ def.add(BestFitEstimator.class);
+ estP.setDefaultValue(def);
+ if(config.grab(estP)) {
+ estimators = estP.instantiateClasses(config);
+ }
+
+ DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.1);
+ if(config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+ }
+
+ @Override
+ protected AttributeWiseBetaNormalization<V> makeInstance() {
+ return new AttributeWiseBetaNormalization<>(estimators, alpha);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java
index dd86cc5a..be501b11 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,12 +33,16 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta.BestFitEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -66,7 +70,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParamet
* @apiviz.uses DistributionEstimator
*/
// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements Normalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseCDFNormalization"})
+public class AttributeWiseCDFNormalization<V extends NumberVector> implements Normalization<V> {
/**
* Class logger.
*/
@@ -85,7 +90,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
/**
* Number vector factory.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
/**
* Constructor.
@@ -99,13 +104,13 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (objects.dataLength() == 0) {
+ if(objects.dataLength() == 0) {
return objects;
}
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
@SuppressWarnings("unchecked")
@@ -119,60 +124,33 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
final int dim = castType.getDimensionality();
dists = new ArrayList<>(dim);
// Scratch space for testing:
- double[] test = new double[castColumn.size()];
+ double[] test = estimators.size() > 1 ? new double[castColumn.size()] : null;
// We iterate over dimensions, this kind of filter needs fast random
// access.
Adapter adapter = new Adapter();
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
adapter.dim = d;
- if (estimators.size() == 1) {
- dists.add(estimators.get(0).estimate(castColumn, adapter));
- } else {
- Distribution best = null;
- double bestq = Double.POSITIVE_INFINITY;
- trials: for (DistributionEstimator<?> est : estimators) {
- try {
- Distribution dist = est.estimate(castColumn, adapter);
- for (int i = 0; i < test.length; i++) {
- test[i] = dist.cdf(castColumn.get(i).doubleValue(d));
- if (Double.isNaN(test[i])) {
- LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
- continue trials;
- }
- if (Double.isInfinite(test[i])) {
- LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
- continue trials;
- }
- }
- Arrays.sort(test);
- double q = KolmogorovSmirnovTest.simpleTest(test);
- if (LOG.isVeryVerbose()) {
- LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
- }
- if (best == null || q < bestq) {
- best = dist;
- bestq = q;
- }
- } catch (ArithmeticException e) {
- if (LOG.isVeryVerbose()) {
- LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
- }
- continue;
- }
- }
- if (LOG.isVerbose()) {
- LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
- }
- dists.add(best);
+ Distribution dist;
+ if(estimators.size() == 1) {
+ dist = estimators.get(0).estimate(castColumn, adapter);
+ }
+ else {
+ dist = findBestFit(castColumn, adapter, d, test);
+ }
+ // Special handling for constant distributions:
+ // We want them to remain 0, instead of - usually - becoming constant .5
+ if(dist instanceof UniformDistribution) {
+ dist = constantZero(castColumn, adapter) ? new UniformDistribution(0., 1.) : dist;
}
+ dists.add(dist);
}
// Normalization scan
double[] buf = new double[dim];
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
buf[d] = dists.get(d).cdf(obj.doubleValue(d));
}
castColumn.set(i, factory.newNumberVector(buf));
@@ -181,6 +159,71 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
return objects;
}
+ /**
+ * Find the best fitting distribution.
+ *
+ * @param col Column of table
+ * @param adapter Adapter for accessing the data
+ * @param d Dimension
+ * @param test Scratch space for testing goodness of fit
+ * @return Best fit distribution
+ */
+ protected Distribution findBestFit(final List<V> col, Adapter adapter, int d, double[] test) {
+ Distribution best = null;
+ double bestq = Double.POSITIVE_INFINITY;
+ trials: for(DistributionEstimator<?> est : estimators) {
+ try {
+ Distribution dist = est.estimate(col, adapter);
+ for(int i = 0; i < test.length; i++) {
+ test[i] = dist.cdf(col.get(i).doubleValue(d));
+ if(Double.isNaN(test[i])) {
+ LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
+ continue trials;
+ }
+ if(Double.isInfinite(test[i])) {
+ LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
+ continue trials;
+ }
+ }
+ Arrays.sort(test);
+ double q = KolmogorovSmirnovTest.simpleTest(test);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
+ }
+ if(best == null || q < bestq) {
+ best = dist;
+ bestq = q;
+ }
+ }
+ catch(ArithmeticException e) {
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
+ }
+ continue trials;
+ }
+ }
+ if(LOG.isVerbose()) {
+ LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
+ }
+ return best;
+ }
+
+ /**
+ * Test if an attribute is constant zero, using primitive access (no boxing).
+ *
+ * @param column Column of the table (list of vectors)
+ * @param adapter Data accessor; its {@code dim} selects the attribute tested
+ * @return {@code true} if all values are zero
+ */
+ protected boolean constantZero(List<V> column, Adapter adapter) {
+ for(int i = 0, s = adapter.size(column); i < s; i++) {
+ if(adapter.getDouble(column, i) != 0.) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
@@ -198,8 +241,8 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
result.append('\n');
result.append("normalization distributions: ");
boolean first = true;
- for (DistributionEstimator<?> est : estimators) {
- if (!first) {
+ for(DistributionEstimator<?> est : estimators) {
+ if(!first) {
result.append(',');
}
first = false;
@@ -212,52 +255,52 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
* Array adapter class for vectors.
*
* @author Erich Schubert
- *
+ *
* @apiviz.exclude
*/
- private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector<?>>> {
+ private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector>> {
/**
* Dimension to process.
*/
int dim;
@Override
- public int size(List<? extends NumberVector<?>> array) {
+ public int size(List<? extends NumberVector> array) {
return array.size();
}
@Override
- public Double get(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public Double get(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return getDouble(array, off);
}
@Override
- public double getDouble(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public double getDouble(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).doubleValue(dim);
}
@Override
- public float getFloat(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public float getFloat(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).floatValue(dim);
}
@Override
- public int getInteger(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public int getInteger(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).intValue(dim);
}
@Override
- public short getShort(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public short getShort(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).shortValue(dim);
}
@Override
- public long getLong(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public long getLong(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).longValue(dim);
}
@Override
- public byte getByte(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public byte getByte(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).byteValue(dim);
}
}
@@ -269,7 +312,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter for distribution estimators.
*/
@@ -287,7 +330,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
List<Class<? extends DistributionEstimator<?>>> def = new ArrayList<>(1);
def.add(BestFitEstimator.class);
estP.setDefaultValue(def);
- if (config.grab(estP)) {
+ if(config.grab(estP)) {
estimators = estP.instantiateClasses(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java
index 9a263171..e4af3a92 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
* Attribute-wise Normalization using the error function. This mostly makes
@@ -35,11 +37,12 @@ import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
*
* @author Erich Schubert
*
- * @param <O> Object type
+ * @param <V> Object type
*
* @apiviz.uses NumberVector
*/
-public class AttributeWiseErfNormalization<O extends NumberVector<?>> extends AbstractNormalization<O> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseErfNormalization"})
+public class AttributeWiseErfNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
@@ -53,26 +56,21 @@ public class AttributeWiseErfNormalization<O extends NumberVector<?>> extends Ab
}
@Override
- public O restore(O featureVector) {
- throw new UnsupportedOperationException("Not implemented yet.");
- }
-
- @Override
- protected O filterSingleObject(O obj) {
+ protected V filterSingleObject(V obj) {
double[] val = new double[obj.getDimensionality()];
- for (int i = 0; i < val.length; i++) {
+ for(int i = 0; i < val.length; i++) {
val[i] = NormalDistribution.erf(obj.doubleValue(i));
}
return factory.newNumberVector(val);
}
@Override
- protected SimpleTypeInformation<? super O> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ protected Logging getLogger() {
+ return LOG;
}
@Override
- protected Logging getLogger() {
- return LOG;
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
index 8c4f15e1..ec50aadd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,10 +31,13 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
@@ -54,7 +57,8 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
* @apiviz.uses NumberVector
*/
// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements Normalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMADNormalization"})
+public class AttributeWiseMADNormalization<V extends NumberVector> implements Normalization<V> {
/**
* Class logger.
*/
@@ -63,7 +67,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
/**
* Number vector factory.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
/**
* Stores the median in each dimension.
@@ -71,9 +75,9 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
private double[] median = new double[0];
/**
- * Stores the median absolute deviation in each dimension.
+ * Stores the inverse median absolute deviation in each dimension.
*/
- private double[] madsigma = new double[0];
+ private double[] imadsigma = new double[0];
/**
* Constructor.
@@ -84,13 +88,13 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (objects.dataLength() == 0) {
+ if(objects.dataLength() == 0) {
return objects;
}
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
@SuppressWarnings("unchecked")
@@ -103,61 +107,72 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
// Scan to find the best
final int dim = castType.getDimensionality();
median = new double[dim];
- madsigma = new double[dim];
+ imadsigma = new double[dim];
// Scratch space for testing:
double[] test = new double[castColumn.size()];
FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Analyzing data.", dim, LOG) : null;
// We iterate over dimensions, this kind of filter needs fast random
// access.
- for (int d = 0; d < dim; d++) {
- for (int i = 0; i < test.length; i++) {
+ for(int d = 0; d < dim; d++) {
+ for(int i = 0; i < test.length; i++) {
test[i] = castColumn.get(i).doubleValue(d);
}
final double med = QuickSelect.median(test);
median[d] = med;
- for (int i = 0; i < test.length; i++) {
+ int zeros = 0;
+ for(int i = 0; i < test.length; i++) {
test[i] = Math.abs(test[i] - med);
+ if(test[i] == 0.) {
+ zeros++;
+ }
}
// Rescale the true MAD for the best standard deviation estimate:
- madsigma[d] = QuickSelect.median(test) * NormalDistribution.ONEBYPHIINV075;
- if (dprog != null) {
- dprog.incrementProcessed(LOG);
+ if(zeros < (test.length >>> 1)) {
+ imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(test);
}
+ else if(zeros == test.length) {
+ LOG.warning("Constant attribute detected. Using MAD=1.");
+ imadsigma[d] = 1.; // Does not matter. Constant distribution.
+ }
+ else {
+ // We have more than 50% zeros, so the regular MAD estimate does not
+ // work. Generalize the MAD approach to use the 50% non-zero value:
+ final int rank = zeros + ((test.length - zeros) >> 1);
+ final double rel = .5 + rank * .5 / test.length;
+ imadsigma[d] = NormalDistribution.quantile(0., 1., rel) / QuickSelect.quickSelect(test, rank);
+ LOG.warning("Near-constant attribute detected. Using modified MAD.");
+ }
+ LOG.incrementProcessed(dprog);
}
- if (dprog != null) {
- dprog.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(dprog);
FiniteProgress nprog = LOG.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), LOG) : null;
// Normalization scan
double[] buf = new double[dim];
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
buf[d] = normalize(d, obj.doubleValue(d));
}
castColumn.set(i, factory.newNumberVector(buf));
- if (nprog != null) {
- nprog.incrementProcessed(LOG);
- }
- }
- if (nprog != null) {
- nprog.ensureCompleted(LOG);
+ LOG.incrementProcessed(nprog);
}
+ LOG.ensureCompleted(nprog);
}
return objects;
}
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if (featureVector.getDimensionality() == median.length) {
+ if(featureVector.getDimensionality() == median.length) {
double[] values = new double[featureVector.getDimensionality()];
- for (int d = 0; d < featureVector.getDimensionality(); d++) {
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
values[d] = restore(d, featureVector.doubleValue(d));
}
return factory.newNumberVector(values);
- } else {
+ }
+ else {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + median.length);
}
}
@@ -175,7 +190,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
* @return Normalized value
*/
private double normalize(int d, double val) {
- return (val - median[d]) / madsigma[d];
+ return (val - median[d]) * imadsigma[d];
}
/**
@@ -186,7 +201,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
* @return Normalized value
*/
private double restore(int d, double val) {
- return (val * madsigma[d]) + median[d];
+ return (val / imadsigma[d]) + median[d];
}
@Override
@@ -196,7 +211,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
result.append('\n');
result.append("normalization median: ").append(FormatUtil.format(median));
result.append('\n');
- result.append("normalization MAD sigma: ").append(FormatUtil.format(madsigma));
+ result.append("normalization scaling factor: ").append(FormatUtil.format(imadsigma));
return result.toString();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java
new file mode 100644
index 00000000..1039ab5b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java
@@ -0,0 +1,207 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+
+/**
+ * Normalization designed for data with a <em>meaningful zero</em>: Each
+ * attribute is scaled to have the same mean (but 0 is not changed).
+ *
+ * @author Erich Schubert
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMeanNormalization"})
+public class AttributeWiseMeanNormalization<V extends NumberVector> extends AbstractNormalization<V> {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(AttributeWiseMeanNormalization.class);
+
+  /**
+   * Stores the mean in each dimension. {@code null} or empty until the means
+   * have been supplied via the constructor or computed from the data.
+   */
+  private double[] mean = null;
+
+  /**
+   * Temporary storage used during initialization: running sum per dimension.
+   */
+  double[] sums = null;
+
+  /**
+   * Count the number of values seen.
+   */
+  int c = 0;
+
+  /**
+   * Constructor.
+   *
+   * @param mean Mean value (the array is stored as-is, not copied)
+   */
+  public AttributeWiseMeanNormalization(double[] mean) {
+    super();
+    this.mean = mean;
+  }
+
+  /**
+   * Constructor.
+   */
+  public AttributeWiseMeanNormalization() {
+    super();
+  }
+
+  /**
+   * Check whether the means still have to be computed from the data.
+   *
+   * @param in Input type information (not used here)
+   * @return {@code true} if no means are available yet; presumably this makes
+   *         the superclass run the preparation pass (confirm against
+   *         AbstractNormalization)
+   */
+  @Override
+  protected boolean prepareStart(SimpleTypeInformation<V> in) {
+    return (mean == null || mean.length == 0);
+  }
+
+  /**
+   * Accumulate one vector into the per-dimension sums.
+   *
+   * @param featureVector Vector to add
+   */
+  @Override
+  protected void prepareProcessInstance(V featureVector) {
+    // First object? Then init. (We didn't have a dimensionality before!)
+    if(sums == null || sums.length == 0) {
+      int dimensionality = featureVector.getDimensionality();
+      sums = new double[dimensionality];
+    }
+    for(int d = 0; d < featureVector.getDimensionality(); d++) {
+      sums[d] += featureVector.doubleValue(d);
+    }
+    ++c;
+  }
+
+  /**
+   * Turn the accumulated sums into means and release the temporary storage.
+   */
+  @Override
+  protected void prepareComplete() {
+    if(sums == null) {
+      // No vector was accumulated, so there is nothing to average. Keep an
+      // empty array (treated as "not initialized" by prepareStart) instead of
+      // failing with a NullPointerException on sums.length below.
+      mean = new double[0];
+      return;
+    }
+    StringBuilder buf = LOG.isVerbose() ? new StringBuilder() : null;
+    final int dimensionality = sums.length;
+    mean = new double[dimensionality];
+    if(buf != null) {
+      buf.append("Normalization parameters: ");
+    }
+    for(int d = 0; d < dimensionality; d++) {
+      mean[d] = sums[d] / c;
+      if(buf != null) {
+        buf.append(" m: ").append(mean[d]);
+      }
+    }
+    sums = null;
+    if(buf != null) {
+      // Emit at the same level that was tested when creating the buffer; the
+      // message was previously sent to debugFine although it was only built
+      // when verbose logging is enabled.
+      LOG.verbose(buf.toString());
+    }
+  }
+
+  /**
+   * Normalize a single vector by dividing each attribute by its mean.
+   *
+   * @param featureVector Vector to normalize
+   * @return New vector produced by the factory
+   */
+  @Override
+  protected V filterSingleObject(V featureVector) {
+    double[] values = new double[featureVector.getDimensionality()];
+    for(int d = 0; d < featureVector.getDimensionality(); d++) {
+      values[d] = normalize(d, featureVector.doubleValue(d));
+    }
+    return factory.newNumberVector(values);
+  }
+
+  /**
+   * Undo the normalization of a vector.
+   *
+   * @param featureVector Normalized vector
+   * @return Vector with each attribute multiplied by its mean again
+   * @throws NonNumericFeaturesException if the dimensionality differs from the
+   *         number of stored means
+   */
+  @Override
+  public V restore(V featureVector) throws NonNumericFeaturesException {
+    if(featureVector.getDimensionality() != mean.length) {
+      throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
+    }
+    double[] values = new double[featureVector.getDimensionality()];
+    for(int d = 0; d < featureVector.getDimensionality(); d++) {
+      values[d] = restore(d, featureVector.doubleValue(d));
+    }
+    return factory.newNumberVector(values);
+  }
+
+  /**
+   * Normalize a single dimension. A single stored mean is applied to every
+   * dimension. A mean of zero yields an infinite or NaN result, as this is a
+   * plain floating point division.
+   *
+   * @param d Dimension
+   * @param val Value
+   * @return Normalized value
+   */
+  private double normalize(int d, double val) {
+    d = (mean.length == 1) ? 0 : d;
+    return val / mean[d];
+  }
+
+  /**
+   * Restore a single dimension. A single stored mean is applied to every
+   * dimension.
+   *
+   * @param d Dimension
+   * @param val Normalized value
+   * @return Restored value
+   */
+  private double restore(int d, double val) {
+    d = (mean.length == 1) ? 0 : d;
+    return val * mean[d];
+  }
+
+  /**
+   * Transform a linear equation system using the stored means. The
+   * coefficient and right-hand-side arrays obtained from the input are
+   * modified in place.
+   *
+   * @param linearEquationSystem System to transform
+   * @return New system built from the modified arrays and the original
+   *         permutations
+   */
+  @Override
+  public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+    double[][] coeff = linearEquationSystem.getCoefficents();
+    double[] rhs = linearEquationSystem.getRHS();
+    int[] row = linearEquationSystem.getRowPermutations();
+    int[] col = linearEquationSystem.getColumnPermutations();
+
+    // NOTE(review): the outer loop repeats the same rescaling coeff.length
+    // times, and mean[k] does not use the single-mean shortcut of
+    // normalize()/restore(). Arithmetic kept as is; confirm the intended
+    // transformation before changing it.
+    for(int i = 0; i < coeff.length; i++) {
+      for(int r = 0; r < coeff.length; r++) {
+        double sum = 0.0;
+        // Index named k so that it does not shadow the instance counter c.
+        for(int k = 0; k < coeff[0].length; k++) {
+          sum += coeff[row[r]][col[k]] / mean[k];
+          coeff[row[r]][col[k]] = coeff[row[r]][col[k]] / mean[k];
+        }
+        rhs[row[r]] = rhs[row[r]] + sum;
+      }
+    }
+
+    return new LinearEquationSystem(coeff, rhs, row, col);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder();
+    result.append("normalization class: ").append(getClass().getName());
+    result.append('\n');
+    result.append("normalization means: ").append(FormatUtil.format(mean));
+
+    return result.toString();
+  }
+
+  @Override
+  protected Logging getLogger() {
+    return LOG;
+  }
+
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.NUMBER_VECTOR_FIELD;
+  }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java
index 47b6db5f..26a125ad 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,11 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -46,24 +49,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParamet
*
* @apiviz.uses NumberVector
*/
-// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMinMaxNormalization"})
+public class AttributeWiseMinMaxNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(AttributeWiseMinMaxNormalization.class);
/**
- * Parameter for minimum.
- */
- public static final OptionID MINIMA_ID = new OptionID("normalize.min", "a comma separated concatenation of the minimum values in each dimension that are mapped to 0. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
-
- /**
- * Parameter for maximum.
- */
- public static final OptionID MAXIMA_ID = new OptionID("normalize.max", "a comma separated concatenation of the maximum values in each dimension that are mapped to 1. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
-
- /**
* Stores the maximum in each dimension.
*/
private double[] maxima = new double[0];
@@ -130,16 +123,14 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if(featureVector.getDimensionality() == maxima.length && featureVector.getDimensionality() == minima.length) {
- double[] values = new double[featureVector.getDimensionality()];
- for(int d = 0; d < featureVector.getDimensionality(); d++) {
- values[d] = (featureVector.doubleValue(d) * (factor(d)) + minima[d]);
- }
- return factory.newNumberVector(values);
- }
- else {
+ if(featureVector.getDimensionality() != maxima.length || featureVector.getDimensionality() != minima.length) {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + maxima.length);
}
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = featureVector.doubleValue(d) * factor(d) + minima[d];
+ }
+ return factory.newNumberVector(values);
}
/**
@@ -174,8 +165,7 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
}
}
- LinearEquationSystem lq = new LinearEquationSystem(coeff, rhs, row, col);
- return lq;
+ return new LinearEquationSystem(coeff, rhs, row, col);
}
@Override
@@ -190,13 +180,13 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
}
@Override
- protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ protected Logging getLogger() {
+ return LOG;
}
@Override
- protected Logging getLogger() {
- return LOG;
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
}
/**
@@ -206,7 +196,17 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for minimum.
+ */
+ public static final OptionID MINIMA_ID = new OptionID("normalize.min", "a comma separated concatenation of the minimum values in each dimension that are mapped to 0. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for maximum.
+ */
+ public static final OptionID MAXIMA_ID = new OptionID("normalize.max", "a comma separated concatenation of the maximum values in each dimension that are mapped to 1. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
+
/**
* Stores the maximum in each dimension.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java
index a24cae25..a7241441 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,9 +26,12 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -48,32 +51,22 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParamet
*
* @apiviz.uses NumberVector
*/
-// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseVarianceNormalization", "z" })
+public class AttributeWiseVarianceNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(AttributeWiseVarianceNormalization.class);
/**
- * Parameter for means.
- */
- public static final OptionID MEAN_ID = new OptionID("normalize.mean", "a comma separated concatenation of the mean values in each dimension that are mapped to 0. If no value is specified, the mean value of the attribute range in this dimension will be taken.");
-
- /**
- * Parameter for stddevs.
- */
- public static final OptionID STDDEV_ID = new OptionID("normalize.stddev", "a comma separated concatenation of the standard deviations in each dimension that are scaled to 1. If no value is specified, the standard deviation of the attribute range in this dimension will be taken.");
-
- /**
* Stores the mean in each dimension.
*/
- private double[] mean = new double[0];
+ private double[] mean;
/**
* Stores the standard deviation in each dimension.
*/
- private double[] stddev = new double[0];
+ private double[] stddev;
/**
* Temporary storage used during initialization.
@@ -152,16 +145,14 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if(featureVector.getDimensionality() == mean.length) {
- double[] values = new double[featureVector.getDimensionality()];
- for(int d = 0; d < featureVector.getDimensionality(); d++) {
- values[d] = restore(d, featureVector.doubleValue(d));
- }
- return factory.newNumberVector(values);
- }
- else {
+ if(featureVector.getDimensionality() != mean.length) {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
}
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = restore(d, featureVector.doubleValue(d));
+ }
+ return factory.newNumberVector(values);
}
/**
@@ -172,12 +163,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
* @return Normalized value
*/
private double normalize(int d, double val) {
- if(mean.length == 1) {
- return (val - mean[0]) / stddev[0];
- }
- else {
- return (val - mean[d]) / stddev[d];
- }
+ d = (mean.length == 1) ? 0 : d;
+ return (val - mean[d]) / stddev[d];
}
/**
@@ -188,12 +175,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
* @return Normalized value
*/
private double restore(int d, double val) {
- if(mean.length == 1) {
- return (val * stddev[0]) + mean[0];
- }
- else {
- return (val * stddev[d]) + mean[d];
- }
+ d = (mean.length == 1) ? 0 : d;
+ return (val * stddev[d]) + mean[d];
}
@Override
@@ -214,13 +197,7 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
}
}
- LinearEquationSystem lq = new LinearEquationSystem(coeff, rhs, row, col);
- return lq;
- }
-
- @Override
- protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ return new LinearEquationSystem(coeff, rhs, row, col);
}
@Override
@@ -240,6 +217,11 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
return LOG;
}
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
+ }
+
/**
* Parameterization class.
*
@@ -247,7 +229,17 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for means.
+ */
+ public static final OptionID MEAN_ID = new OptionID("normalize.mean", "a comma separated concatenation of the mean values in each dimension that are mapped to 0. If no value is specified, the mean value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for stddevs.
+ */
+ public static final OptionID STDDEV_ID = new OptionID("normalize.stddev", "a comma separated concatenation of the standard deviations in each dimension that are scaled to 1. If no value is specified, the standard deviation of the attribute range in this dimension will be taken.");
+
/**
* Stores the mean in each dimension.
*/
@@ -261,22 +253,22 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- DoubleListParameter meanP = new DoubleListParameter(MEAN_ID, true);
- DoubleListParameter stddevP = new DoubleListParameter(STDDEV_ID, true);
- config.grab(meanP);
- config.grab(stddevP);
- // Note: grab first, then use isDefined, to ensure the stddev is grabbed.
- if(meanP.isDefined() && stddevP.isDefined()) {
+ DoubleListParameter meanP = new DoubleListParameter(MEAN_ID) //
+ .setOptional(true);
+ if(config.grab(meanP)) {
mean = ArrayLikeUtil.toPrimitiveDoubleArray(meanP.getValue());
+ }
+ DoubleListParameter stddevP = new DoubleListParameter(STDDEV_ID) //
+ .setOptional(true);
+ if(config.grab(stddevP)) {
stddev = ArrayLikeUtil.toPrimitiveDoubleArray(stddevP.getValue());
for(double d : stddev) {
- if(d == 0) {
+ if(d == 0.) {
config.reportError(new WrongParameterValueException("Standard deviations must not be 0."));
}
}
}
-
config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(meanP, stddevP));
config.checkConstraint(new EqualSizeGlobalConstraint(meanP, stddevP));
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java
index bb9c2aec..ca320ec6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
*/
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.IntegerVector;
@@ -34,21 +33,25 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
/**
* Normalize vectors according to their rank in the attributes.
*
- * Note: ranks are multiplied by 2, to be able to give ties an integer rank.
- * (e.g. first two records are tied at "1" then, followed by the next on "4")
+ * Note: <b>ranks are multiplied by 2</b>, to be able to give ties an integer
+ * rank. (e.g. when the first two records are tied, they both have rank "1"
+ * then, followed by the next on "4")
*
* @author Erich Schubert
*/
-public class RankTieNormalization implements ObjectFilter {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.IntegerRankTieNormalization"})
+public class IntegerRankTieNormalization implements ObjectFilter {
/**
* Constructor.
*/
- public RankTieNormalization() {
+ public IntegerRankTieNormalization() {
super();
}
@@ -57,6 +60,12 @@ public class RankTieNormalization implements ObjectFilter {
final int len = objects.dataLength();
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+ int[] order = new int[len];
+ for(int i = 0; i < len; i++) {
+ order[i] = i;
+ }
+ Sorter comparator = new Sorter();
+
for(int r = 0; r < objects.metaLength(); r++) {
final SimpleTypeInformation<?> type = objects.meta(r);
final List<?> column = objects.getColumn(r);
@@ -65,7 +74,7 @@ public class RankTieNormalization implements ObjectFilter {
continue;
}
@SuppressWarnings("unchecked")
- final List<? extends NumberVector<?>> castColumn = (List<? extends NumberVector<?>>) column;
+ final List<? extends NumberVector> castColumn = (List<? extends NumberVector>) column;
// Get the replacement type information
final int dim = ((VectorFieldTypeInformation<?>) type).getDimensionality();
final VectorFieldTypeInformation<IntegerVector> outType = new VectorFieldTypeInformation<>(IntegerVector.STATIC, dim);
@@ -73,29 +82,21 @@ public class RankTieNormalization implements ObjectFilter {
// Output vectors
int[][] posvecs = new int[len][dim];
// Sort for each dimension
- // TODO: an int[] array would be enough, if we could use a comparator...
- DoubleIntPair[] sorter = new DoubleIntPair[len];
- for(int i = 0; i < sorter.length; i++) {
- sorter[i] = new DoubleIntPair(Double.NaN, -1);
- }
for(int d = 0; d < dim; d++) {
- // fill array
- for(int i = 0; i < sorter.length; i++) {
- sorter[i].first = castColumn.get(i).doubleValue(d);
- sorter[i].second = i;
- }
// Sort
- Arrays.sort(sorter);
+ comparator.setup(castColumn, d);
+ IntegerArrayQuickSort.sort(order, comparator);
// Transfer positions to output vectors
- for(int sta = 0; sta < sorter.length;) {
+ for(int sta = 0; sta < order.length;) {
+ double v = castColumn.get(order[sta]).doubleValue(d);
// Compute ties
int end = sta + 1;
- while(end < sorter.length && !(sorter[sta].first < sorter[end].first)) {
+ while(end < order.length && !(v < castColumn.get(order[end]).doubleValue(d))) {
end++;
}
final int pos = (sta + end - 1);
for(int i = sta; i < end; i++) {
- posvecs[sorter[i].second][d] = pos;
+ posvecs[order[i]][d] = pos;
}
sta = end;
}
@@ -110,4 +111,40 @@ public class RankTieNormalization implements ObjectFilter {
}
return bundle;
}
+
+ /**
+ * Class to sort an index array by a particular dimension.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class Sorter implements IntegerComparator {
+   /**
+    * Column whose vectors supply the sort keys.
+    */
+   List<? extends NumberVector> col;
+
+   /**
+    * Index of the attribute read from each vector.
+    */
+   int dim;
+
+   /**
+    * Point the comparator at a column and an attribute.
+    *
+    * @param col Column to read
+    * @param dim Dimension to use.
+    */
+   public void setup(List<? extends NumberVector> col, int dim) {
+     this.dim = dim;
+     this.col = col;
+   }
+
+   @Override
+   public int compare(int x, int y) {
+     final double vx = col.get(x).doubleValue(dim);
+     final double vy = col.get(y).doubleValue(dim);
+     if(vx < vy) {
+       return -1;
+     }
+     // Equal keys tie; anything else (greater, or NaN on either side) yields +1.
+     return (vx == vy) ? 0 : +1;
+   }
+ }
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java
index 21263890..99054f83 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,11 +29,13 @@ import gnu.trove.map.hash.TIntDoubleHashMap;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
- * Normalization for text frequency vectors, using the inverse document
- * frequency.
+ * Normalization for text frequency (TF) vectors, using the inverse document
+ * frequency (IDF). See also: TF-IDF for text analysis.
*
* @author Erich Schubert
*
@@ -41,7 +43,8 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
*
* @param <V> Vector type
*/
-public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.InverseDocumentFrequencyNormalization" })
+public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
@@ -102,7 +105,7 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<
final int dim = featureVector.iterDim(it);
vals.put(dim, featureVector.iterDoubleValue(it) * idf.get(dim));
}
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
+ return ((SparseNumberVector.Factory<V>) factory).newNumberVector(vals, featureVector.getDimensionality());
}
@Override
@@ -112,12 +115,12 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<
final int dim = featureVector.iterDim(it);
vals.put(dim, featureVector.iterDoubleValue(it) / idf.get(dim));
}
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
+ return ((SparseNumberVector.Factory<V>) factory).newNumberVector(vals, featureVector.getDimensionality());
}
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.SPARSE_VECTOR_FIELD;
+ return TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH;
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java
new file mode 100644
index 00000000..f1fac885
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Normalizations operating on columns / variates; where each column is treated independently.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java
new file mode 100644
index 00000000..b2da96a9
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java
@@ -0,0 +1,97 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Normalize histograms by scaling them to L1 norm 1, then taking the square
+ * root in each attribute.
+ *
+ * Using Euclidean distance (linear kernel) and this transformation is the same
+ * as using Hellinger distance:
+ * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.probabilistic.HellingerDistanceFunction}
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.HellingerHistogramNormalization" })
+public class HellingerHistogramNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Static instance.
+ */
+ public static final HellingerHistogramNormalization<NumberVector> STATIC = new HellingerHistogramNormalization<>();
+
+ /**
+ * Constructor. Takes no parameters and only invokes the superclass
+ * constructor.
+ */
+ public HellingerHistogramNormalization() {
+ super();
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+   final int dim = featureVector.getDimensionality();
+   final double[] data = new double[dim];
+   // First pass: take magnitudes and accumulate their sum (the L1 norm).
+   double sum = 0.;
+   for(int d = 0; d < dim; ++d) {
+     final double v = featureVector.doubleValue(d);
+     data[d] = v > 0 ? v : -v;
+     sum += data[d];
+   }
+   // A zero (or NaN) sum leaves the magnitudes untouched.
+   if(!(sum > 0.)) {
+     return factory.newNumberVector(data);
+   }
+   // Second pass: scale to L1 norm 1 and take the square root.
+   for(int d = 0; d < dim; ++d) {
+     if(data[d] > 0) {
+       data[d] = Math.sqrt(data[d] / sum);
+     }
+   }
+   return factory.newNumberVector(data);
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ // Restrict input to TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH; presumably number
+ // vectors without a fixed dimensionality -- confirm against TypeUtil.
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected HellingerHistogramNormalization<NumberVector> makeInstance() {
+ // No options are defined here, so the shared STATIC instance is returned
+ // instead of constructing a new filter.
+ return STATIC;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java
new file mode 100644
index 00000000..05485909
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java
@@ -0,0 +1,159 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Normalize vectors such that they have zero mean and unit variance.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class InstanceMeanVarianceNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Multiplicity of the vector.
+ */
+ private int multiplicity;
+
+ /**
+ * Constructor. Takes no parameters; the multiplicity field is assigned later
+ * in initializeOutputType.
+ */
+ public InstanceMeanVarianceNormalization() {
+ super();
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+   final double[] values = featureVector.getColumnVector().getArrayRef();
+   final int n = values.length;
+   if(n == 0) {
+     return factory.newNumberVector(new double[0]);
+   }
+   if(n == 1) {
+     // A single value standardizes to 0, but a NaN stays NaN.
+     final double only = Double.isNaN(values[0]) ? Double.NaN : 0.;
+     return factory.newNumberVector(new double[] { only });
+   }
+   if(multiplicity <= 1) {
+     return factory.newNumberVector(univariateStandardization(values));
+   }
+   // Multivariate codepath:
+   assert (n % multiplicity == 0) : "Vector length is not divisible by multiplicity?";
+   return factory.newNumberVector(multivariateStandardization(values));
+ }
+
+ /**
+  * Standardize all values of a vector with one common mean and standard
+  * deviation.
+  *
+  * NaN entries are skipped while accumulating the sum and the sum of squared
+  * deviations. The array is modified in place; if the standard deviation is
+  * not positive, the values are left unchanged.
+  *
+  * @param raw Values to standardize (modified in place)
+  * @return the same array instance
+  */
+ protected double[] univariateStandardization(double[] raw) {
+   // Two pass normalization is numerically most stable,
+   // And Java should optimize this well enough.
+   double sum = 0.;
+   for(int i = 0; i < raw.length; ++i) {
+     final double v = raw[i];
+     if(v != v) { // NaN guard
+       continue;
+     }
+     sum += v;
+   }
+   // NOTE(review): the divisor is raw.length even when NaN entries were
+   // skipped above -- confirm whether NaNs should be excluded from the count.
+   final double mean = sum / raw.length;
+   double ssum = 0.;
+   for(int i = 0; i < raw.length; ++i) {
+     final double v = raw[i] - mean;
+     if(v != v) { // NaN guard
+       continue;
+     }
+     ssum += v * v;
+   }
+   // Sample standard deviation: the square root applies to the whole
+   // quotient, sqrt(ssum / (n - 1)), not just to the numerator.
+   final double std = Math.sqrt(ssum / (raw.length - 1));
+   if(std > 0.) {
+     for(int i = 0; i < raw.length; ++i) {
+       raw[i] = (raw[i] - mean) / std;
+     }
+   }
+   return raw;
+ }
+
+ /**
+  * Standardize each interleaved component of a multivariate vector
+  * separately.
+  *
+  * Position i of the array is assigned to component (i % multiplicity). NaN
+  * entries are skipped while accumulating sums. The array is modified in
+  * place; it is returned unchanged when each component has at most one
+  * entry.
+  *
+  * @param raw Values to standardize (modified in place)
+  * @return the same array instance
+  */
+ protected double[] multivariateStandardization(double[] raw) {
+   final int len = raw.length / multiplicity;
+   if(len <= 1) {
+     return raw;
+   }
+   // Two pass normalization is numerically most stable,
+   // And Java should optimize this well enough.
+   double[] mean = new double[multiplicity];
+   for(int i = 0, j = 0; i < raw.length; ++i, j = (j + 1) % multiplicity) {
+     final double v = raw[i];
+     if(v != v) { // NaN guard
+       continue;
+     }
+     mean[j] += v;
+   }
+   for(int j = 0; j < multiplicity; ++j) {
+     mean[j] /= len;
+   }
+   double[] std = new double[multiplicity];
+   for(int i = 0, j = 0; i < raw.length; ++i, j = (j + 1) % multiplicity) {
+     final double v = raw[i] - mean[j];
+     if(v != v) { // NaN guard
+       continue;
+     }
+     std[j] += v * v;
+   }
+   for(int j = 0; j < multiplicity; ++j) {
+     // Sample standard deviation sqrt(SS / (len - 1)); components without
+     // spread use divisor 1 so they are only centered.
+     std[j] = std[j] > 0. ? Math.sqrt(std[j] / (len - 1)) : 1;
+   }
+   for(int i = 0, j = 0; i < raw.length; ++i, j = (j + 1) % multiplicity) {
+     raw[i] = (raw[i] - mean[j]) / std[j];
+   }
+   return raw;
+ }
+
+ @Override
+ protected void initializeOutputType(SimpleTypeInformation<V> type) {
+ super.initializeOutputType(type);
+ // Remember the multiplicity reported by the type information;
+ // filterSingleObject takes the multivariate path when it is greater than 1.
+ // The cast is unchecked: this assumes type is a VectorTypeInformation.
+ multiplicity = ((VectorTypeInformation<?>) type).getMultiplicity();
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ // Restrict input to TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH; presumably number
+ // vectors without a fixed dimensionality -- confirm against TypeUtil.
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ @Override
+ protected InstanceMeanVarianceNormalization<V> makeInstance() {
+ // No options are defined here; a fresh filter instance is created per call.
+ return new InstanceMeanVarianceNormalization<>();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java
new file mode 100644
index 00000000..9f8f7680
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java
@@ -0,0 +1,177 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Normalize each vector such that its smallest attribute is mapped to the
+ * configured minimum (default 0) and its largest to the configured maximum
+ * (default 1).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class InstanceMinMaxNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Minimum and maximum values.
+ */
+ private double min, max;
+
+ /**
+ * Multiplicity of the vector.
+ */
+ private int multiplicity;
+
+ /**
+ * Constructor.
+ *
+ * @param min Desired minimum value: the value the smallest attribute of a
+ *        vector is mapped to
+ * @param max Desired maximum value: the value the largest attribute of a
+ *        vector is mapped to
+ */
+ public InstanceMinMaxNormalization(double min, double max) {
+ super();
+ this.min = min;
+ this.max = max;
+ }
+
+ /**
+ * Constructor, normalizing to {@code [0;1]}; delegates to the two-argument
+ * constructor with minimum 0 and maximum 1.
+ */
+ public InstanceMinMaxNormalization() {
+ this(0., 1.);
+ }
+
+ /**
+  * Rescale a single vector linearly so that its observed minimum maps to
+  * {@code min} and its observed maximum to {@code max}.
+  *
+  * With a multiplicity greater than 1, position i of the array is assigned
+  * to component (i % multiplicity) and every component is rescaled with its
+  * own range. NaN entries are ignored when determining the range. A range
+  * without spread (minimum not below maximum) is left unscaled. The array
+  * obtained from the input vector is modified in place.
+  *
+  * @param featureVector Vector to normalize
+  * @return vector built by the factory from the rescaled values
+  */
+ @Override
+ protected V filterSingleObject(V featureVector) {
+   double[] raw = featureVector.getColumnVector().getArrayRef();
+   // Multivariate codepath:
+   if(multiplicity > 1) {
+     assert (raw.length % multiplicity == 0) : "Vector length is not divisible by multiplicity?";
+     double[] mi = new double[multiplicity], ma = new double[multiplicity];
+     for(int j = 0; j < multiplicity; j++) {
+       mi[j] = Double.POSITIVE_INFINITY;
+       ma[j] = Double.NEGATIVE_INFINITY;
+     }
+     for(int i = 0, j = 0; i < raw.length; ++i, j = (j + 1) % multiplicity) {
+       final double v = raw[i];
+       if(v != v) { // NaN guard
+         continue;
+       }
+       mi[j] = (mi[j] < v) ? mi[j] : v;
+       ma[j] = (ma[j] > v) ? ma[j] : v;
+     }
+     for(int j = 0; j < multiplicity; j++) {
+       if(mi[j] < ma[j]) {
+         final double s = (max - min) / (ma[j] - mi[j]);
+         // Start at offset j so that exactly the entries of component j are
+         // rescaled; starting at 0 would rescale component 0 repeatedly and
+         // leave the other components untouched.
+         for(int i = j; i < raw.length; i += multiplicity) {
+           raw[i] = (raw[i] - mi[j]) * s + min;
+         }
+       }
+     }
+     return factory.newNumberVector(raw);
+   }
+   // Default codepath
+   double mi = Double.POSITIVE_INFINITY, ma = Double.NEGATIVE_INFINITY;
+   for(int i = 0; i < raw.length; ++i) {
+     final double v = raw[i];
+     if(v != v) { // NaN guard
+       continue;
+     }
+     mi = (mi < v) ? mi : v;
+     ma = (ma > v) ? ma : v;
+   }
+   if(mi < ma) {
+     final double s = (max - min) / (ma - mi);
+     for(int i = 0; i < raw.length; ++i) {
+       raw[i] = (raw[i] - mi) * s + min;
+     }
+   }
+   return factory.newNumberVector(raw);
+ }
+
+ @Override
+ protected void initializeOutputType(SimpleTypeInformation<V> type) {
+ super.initializeOutputType(type);
+ // Remember the multiplicity reported by the type information;
+ // filterSingleObject takes the multivariate path when it is greater than 1.
+ // The cast is unchecked: this assumes type is a VectorTypeInformation.
+ multiplicity = ((VectorTypeInformation<?>) type).getMultiplicity();
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ // Restrict input to TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH; presumably number
+ // vectors without a fixed dimensionality -- confirm against TypeUtil.
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Option ID for minimum value.
+ */
+ public static final OptionID MIN_ID = new OptionID("normalization.min", "Minimum value to assign to objects.");
+
+ /**
+ * Option ID for maximum value.
+ */
+ public static final OptionID MAX_ID = new OptionID("normalization.max", "Maximum value to assign to objects.");
+
+ /**
+ * Minimum and maximum values, passed to the filter constructor in
+ * makeInstance.
+ */
+ private double min, max;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ // Minimum: optional parameter constructed with 0. as second argument.
+ DoubleParameter minP = new DoubleParameter(MIN_ID, 0.) //
+ .setOptional(true);
+ if(config.grab(minP)) {
+ min = minP.doubleValue();
+ }
+ // Maximum: optional parameter constructed with 1. as second argument.
+ DoubleParameter maxP = new DoubleParameter(MAX_ID, 1.) //
+ .setOptional(true);
+ if(config.grab(maxP)) {
+ max = maxP.doubleValue();
+ }
+ // Global constraint relating both parameters; presumably requires
+ // min < max -- confirm against LessGlobalConstraint.
+ config.checkConstraint(new LessGlobalConstraint<>(minP, maxP));
+ }
+
+ @Override
+ protected InstanceMinMaxNormalization<V> makeInstance() {
+ return new InstanceMinMaxNormalization<>(min, max);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java
index a12dea3b..51b2a34b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,9 +26,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.Norm;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -42,42 +43,32 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> vector type
*/
-public class LengthNormalization<V extends NumberVector<?>> extends AbstractStreamNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.LengthNormalization"})
+public class LengthNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
/**
* Norm to use.
*/
- DoubleNorm<? super V> norm;
+ Norm<? super V> norm;
/**
* Constructor.
*
* @param norm Norm to use
*/
- public LengthNormalization(DoubleNorm<? super V> norm) {
+ public LengthNormalization(Norm<? super V> norm) {
super();
this.norm = norm;
}
@Override
protected V filterSingleObject(V featureVector) {
- final double d = norm.doubleNorm(featureVector);
+ final double d = norm.norm(featureVector);
return factory.newNumberVector(featureVector.getColumnVector().timesEquals(1 / d).getArrayRef());
}
@Override
- public V restore(V featureVector) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
- // TODO.
- throw new UnsupportedOperationException();
- }
-
- @Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
}
/**
@@ -87,7 +78,7 @@ public class LengthNormalization<V extends NumberVector<?>> extends AbstractStre
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Option ID for normalization norm.
*/
@@ -96,12 +87,12 @@ public class LengthNormalization<V extends NumberVector<?>> extends AbstractStre
/**
* Norm to use.
*/
- DoubleNorm<? super V> norm;
+ Norm<? super V> norm;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- ObjectParameter<DoubleNorm<? super V>> normP = new ObjectParameter<>(NORM_ID, DoubleNorm.class, EuclideanDistanceFunction.class);
+ ObjectParameter<Norm<? super V>> normP = new ObjectParameter<>(NORM_ID, Norm.class, EuclideanDistanceFunction.class);
if(config.grab(normP)) {
norm = normP.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java
new file mode 100644
index 00000000..8970e7ef
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java
@@ -0,0 +1,119 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Normalize the data set by applying log(1+|x|*b)/log(b+1) to any value. If the
+ * input data was in [0;1], then the resulting values will be in the same range.
+ * Only the absolute value |x| is used, so the sign of the input is discarded.
+ *
+ * By default b=1, and thus the transformation is log2(1+|x|).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.Log1PlusNormalization" })
+public class Log1PlusNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+  /**
+   * Static instance, constructed with a boosting factor of 1.
+   */
+  public static final Log1PlusNormalization<NumberVector> STATIC = new Log1PlusNormalization<>(1.);
+
+  /**
+   * Boosting factor b, and scaling coefficient 1/log(1+b).
+   */
+  protected double boost, scale;
+
+  /**
+   * Constructor.
+   *
+   * @param boost Boosting parameter
+   */
+  public Log1PlusNormalization(double boost) {
+    super();
+    this.boost = boost;
+    // Precompute the denominator log(1+b) once, as a multiplicative factor.
+    this.scale = 1. / Math.log1p(boost);
+  }
+
+  /**
+   * Apply log1p(|x| * boost) * scale to every component of the vector.
+   *
+   * @param featureVector vector to transform
+   * @return vector created by the factory from the transformed values
+   */
+  @Override
+  protected V filterSingleObject(V featureVector) {
+    double[] data = new double[featureVector.getDimensionality()];
+    for(int d = 0; d < data.length; ++d) {
+      // Math.abs maps both 0.0 and -0.0 to +0.0. Negating after a failed
+      // "> 0" test would instead turn 0.0 into -0.0, and log1p keeps the sign
+      // of a zero argument, so zeros would come out as -0.0.
+      data[d] = Math.log1p(Math.abs(featureVector.doubleValue(d)) * boost) * scale;
+    }
+    return factory.newNumberVector(data);
+  }
+
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+    /**
+     * Boosting factor parameter.
+     */
+    public static final OptionID BOOST_ID = new OptionID("log1pscale.boost", "Boosting factor. Larger values will yield a steeper curve.");
+
+    /**
+     * Boosting factor.
+     */
+    protected double boost;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+
+      // The boosting factor is constrained to be greater than zero.
+      DoubleParameter boostP = new DoubleParameter(BOOST_ID, 1.) //
+      .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+      if(config.grab(boostP)) {
+        boost = boostP.doubleValue();
+      }
+    }
+
+    @Override
+    protected Log1PlusNormalization<V> makeInstance() {
+      return new Log1PlusNormalization<>(boost);
+    }
+  }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java
new file mode 100644
index 00000000..9ac613c0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Instancewise normalization, where each instance is normalized independently.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
index 15d689d7..552d7003 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team