summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/filter
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/filter')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java23
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java31
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java7
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java14
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java58
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/DropNaNFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java)56
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/NoMissingValuesFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java)9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/ReplaceNaNWithRandomFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java)65
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java219
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java14
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java77
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java326
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java)171
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java)28
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java)81
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java207
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java)58
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java)96
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java)83
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java)19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java97
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java159
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java177
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java)37
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java119
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java3
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ByLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java)26
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/RandomSamplingStreamFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java)9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ShuffleObjectsFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java)23
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/SortByLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java)19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java33
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/HistogramJitterFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java)13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java436
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java)9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFromPatternFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java)44
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ExternalIDFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java)10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java124
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SparseVectorFieldFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java)12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SplitNumberVectorFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java)17
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java27
62 files changed, 2671 insertions, 618 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
index 1cb68b30..8bedbcc3 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -58,6 +58,7 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
}
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+ final Logging logger = getLogger();
for(int r = 0; r < objects.metaLength(); r++) {
@SuppressWarnings("unchecked")
SimpleTypeInformation<Object> type = (SimpleTypeInformation<Object>) objects.meta(r);
@@ -73,18 +74,14 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
// When necessary, perform an initialization scan
if(prepareStart(castType)) {
- FiniteProgress pprog = getLogger().isVerbose() ? new FiniteProgress("Preparing normalization.", objects.dataLength(), getLogger()) : null;
+ FiniteProgress pprog = logger.isVerbose() ? new FiniteProgress("Preparing normalization.", objects.dataLength(), logger) : null;
for(Object o : column) {
@SuppressWarnings("unchecked")
final I obj = (I) o;
prepareProcessInstance(obj);
- if (pprog != null) {
- pprog.incrementProcessed(getLogger());
- }
- }
- if (pprog != null) {
- pprog.ensureCompleted(getLogger());
+ logger.incrementProcessed(pprog);
}
+ logger.ensureCompleted(pprog);
prepareComplete();
}
@@ -93,19 +90,15 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
bundle.appendColumn(convertedType(castType), castColumn);
// Normalization scan
- FiniteProgress nprog = getLogger().isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), getLogger()) : null;
+ FiniteProgress nprog = logger.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), logger) : null;
for(int i = 0; i < objects.dataLength(); i++) {
@SuppressWarnings("unchecked")
final I obj = (I) column.get(i);
final O normalizedObj = filterSingleObject(obj);
castColumn.set(i, normalizedObj);
- if (nprog != null) {
- nprog.incrementProcessed(getLogger());
- }
- }
- if (nprog != null) {
- nprog.ensureCompleted(getLogger());
+ logger.incrementProcessed(nprog);
}
+ logger.ensureCompleted(nprog);
}
return bundle;
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
index 5b48a8c0..0d4b5f8d 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
index 6a210db3..dca3d221 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
@@ -1,14 +1,10 @@
package de.lmu.ifi.dbs.elki.datasource.filter;
-import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +22,10 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+
/**
* Abstract base class for streaming filters.
*
@@ -39,12 +39,27 @@ public abstract class AbstractStreamFilter implements StreamFilter {
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- init(new StreamFromBundle(objects));
- return MultipleObjectsBundle.fromStream(this);
+ return init(objects.asStream()).asMultipleObjectsBundle();
}
@Override
- public void init(BundleStreamSource source) {
+ public BundleStreamSource init(BundleStreamSource source) {
this.source = source;
+ return this;
+ }
+
+ @Override
+ public boolean hasDBIDs() {
+ return source.hasDBIDs();
+ }
+
+ @Override
+ public boolean assignDBID(DBIDVar var) {
+ return source.assignDBID(var);
+ }
+
+ @Override
+ public MultipleObjectsBundle asMultipleObjectsBundle() {
+ return MultipleObjectsBundle.fromStream(this);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java
index b9305aa6..c565a36c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -34,11 +34,11 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
* @param <I> Input vector type
* @param <O> Output vector type
*/
-public abstract class AbstractVectorConversionFilter<I, O extends NumberVector<?>> extends AbstractConversionFilter<I, O> {
+public abstract class AbstractVectorConversionFilter<I, O extends NumberVector> extends AbstractConversionFilter<I, O> {
/**
* Number vector factory.
*/
- protected NumberVector.Factory<O, ?> factory;
+ protected NumberVector.Factory<O> factory;
/**
* Initialize factory from a data type.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java
index 6a15c41c..b9c337bc 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,11 +33,11 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
* @param <I> Input type
* @param <O> Output vector type
*/
-public abstract class AbstractVectorStreamConversionFilter<I, O extends NumberVector<?>> extends AbstractStreamConversionFilter<I, O> {
+public abstract class AbstractVectorStreamConversionFilter<I, O extends NumberVector> extends AbstractStreamConversionFilter<I, O> {
/**
* Number vector factory.
*/
- protected NumberVector.Factory<O, ?> factory;
+ protected NumberVector.Factory<O> factory;
/**
* Initialize factory from a data type.
@@ -47,5 +47,4 @@ public abstract class AbstractVectorStreamConversionFilter<I, O extends NumberVe
protected void initializeOutputType(SimpleTypeInformation<O> type) {
factory = FilterUtil.guessFactory(type);
}
-
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java
index 9ef9d34f..8873c9a1 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,7 @@ import java.lang.reflect.Field;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
-import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
/**
@@ -51,16 +51,16 @@ public final class FilterUtil {
* @return Factory
*/
@SuppressWarnings("unchecked")
- public static <V extends NumberVector<?>> NumberVector.Factory<V, ?> guessFactory(SimpleTypeInformation<V> in) {
- NumberVector.Factory<V, ?> factory = null;
- if(in instanceof VectorFieldTypeInformation) {
- factory = (NumberVector.Factory<V, ?>) ((VectorFieldTypeInformation<V>) in).getFactory();
+ public static <V extends NumberVector> NumberVector.Factory<V> guessFactory(SimpleTypeInformation<V> in) {
+ NumberVector.Factory<V> factory = null;
+ if(in instanceof VectorTypeInformation) {
+ factory = (NumberVector.Factory<V> ) ((VectorTypeInformation<V>) in).getFactory();
}
if(factory == null) {
// FIXME: hack. Add factories to simple type information, too?
try {
Field f = in.getRestrictionClass().getField("FACTORY");
- factory = (NumberVector.Factory<V, ?>) f.get(null);
+ factory = (NumberVector.Factory<V> ) f.get(null);
}
catch(Exception e) {
LoggingUtil.warning("Cannot determine factory for type " + in.getRestrictionClass(), e);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
index ce02fc29..3c66ad85 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,12 +23,13 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDFactory;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -40,7 +41,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*
* @apiviz.has DBID oneway - - «produces»
*/
-public class FixedDBIDsFilter extends AbstractStreamFilter {
+public class FixedDBIDsFilter implements ObjectFilter {
/**
* The filtered meta
*/
@@ -62,35 +63,11 @@ public class FixedDBIDsFilter extends AbstractStreamFilter {
}
@Override
- public BundleMeta getMeta() {
- return meta;
- }
-
- @Override
- public Event nextEvent() {
- Event ev = source.nextEvent();
- if(ev == Event.META_CHANGED) {
- if(meta == null) {
- meta = new BundleMeta();
- meta.add(TypeUtil.DBID);
- }
- BundleMeta origmeta = source.getMeta();
- // Note -1 for the injected DBID column
- for(int i = meta.size() - 1; i < origmeta.size(); i++) {
- meta.add(origmeta.get(i));
- }
- }
- return ev;
- }
-
- @Override
- public Object data(int rnum) {
- if(rnum == 0) {
- DBID ret = DBIDUtil.importInteger(curid);
- curid++;
- return ret;
- }
- return source.data(rnum - 1);
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ DBIDRange ids = DBIDFactory.FACTORY.generateStaticDBIDRange(curid, objects.dataLength());
+ objects.setDBIDs(ids);
+ curid += objects.dataLength();
+ return objects;
}
/**
@@ -108,19 +85,24 @@ public class FixedDBIDsFilter extends AbstractStreamFilter {
* </p>
*/
public static final OptionID IDSTART_ID = new OptionID("dbc.startid", "Object ID to start counting with");
- int startid = -1;
+
+ /**
+ * First ID to use.
+ */
+ int startid = 0;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter startidParam = new IntParameter(IDSTART_ID);
+ IntParameter startidParam = new IntParameter(IDSTART_ID, 0) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(startidParam)) {
- startid = startidParam.getValue().intValue();
+ startid = startidParam.intValue();
}
}
@Override
- protected Object makeInstance() {
+ protected FixedDBIDsFilter makeInstance() {
return new FixedDBIDsFilter(startid);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
index ce758763..09896f15 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
index 5073cea8..a6c364aa 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
index 798cd05d..45464f31 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
@@ -1,12 +1,10 @@
package de.lmu.ifi.dbs.elki.datasource.filter;
-import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,9 +23,11 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+
/**
- * Streaming filters are often more efficient (less memory use) and can be used
- * in more settings.
+ * Streaming filters are often more efficient (less memory use) as they do not
+ * keep a reference to earlier data.
*
* @author Erich Schubert
*
@@ -41,5 +41,5 @@ public interface StreamFilter extends ObjectFilter, BundleStreamSource {
*
* @param source Stream source
*/
- public void init(BundleStreamSource source);
+ public BundleStreamSource init(BundleStreamSource source);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/DropNaNFilter.java
index fb9cf83e..4c226085 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/DropNaNFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -30,7 +30,9 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -43,6 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.DropNaNFilter" })
public class DropNaNFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -73,33 +76,33 @@ public class DropNaNFilter extends AbstractStreamFilter {
@Override
public Event nextEvent() {
- while (true) {
+ while(true) {
Event ev = source.nextEvent();
- switch(ev) {
+ switch(ev){
case END_OF_STREAM:
return ev;
case META_CHANGED:
updateMeta(source.getMeta());
return ev;
case NEXT_OBJECT:
- if (densecols == null) {
+ if(densecols == null) {
updateMeta(source.getMeta());
}
boolean good = true;
- for (int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
- NumberVector<?> v = (NumberVector<?>) source.data(j);
- if (v == null) {
+ for(int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
+ NumberVector v = (NumberVector) source.data(j);
+ if(v == null) {
good = false;
break;
}
- for (int i = 0; i < v.getDimensionality(); i++) {
- if (Double.isNaN(v.doubleValue(i))) {
+ for(int i = 0; i < v.getDimensionality(); i++) {
+ if(Double.isNaN(v.doubleValue(i))) {
good = false;
break;
}
}
}
- if (good) {
+ if(good) {
return ev;
}
continue;
@@ -114,21 +117,22 @@ public class DropNaNFilter extends AbstractStreamFilter {
*/
private void updateMeta(BundleMeta meta) {
int cols = meta.size();
- if (densecols == null) {
+ if(densecols == null) {
densecols = new BitSet();
- } else {
+ }
+ else {
densecols.clear();
}
- for (int i = 0; i < cols; i++) {
- if (TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+ for(int i = 0; i < cols; i++) {
+ if(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
throw new AbortException("Filtering sparse vectors is not yet supported by this filter. Please contribute.");
}
// TODO: only check for double and float?
- if (TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
densecols.set(i);
continue;
}
- if (TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
densecols.set(i);
continue;
}
@@ -137,32 +141,32 @@ public class DropNaNFilter extends AbstractStreamFilter {
@Override
public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if (LOG.isDebuggingFinest()) {
+ if(LOG.isDebuggingFinest()) {
LOG.debugFinest("Removing records with NaN values.");
}
updateMeta(objects.meta());
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
bundle.appendColumn(objects.meta(j), new ArrayList<>());
}
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final Object[] row = objects.getRow(i);
boolean good = true;
- for (int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
- NumberVector<?> v = (NumberVector<?>) row[j];
- if (v == null) {
+ for(int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
+ NumberVector v = (NumberVector) row[j];
+ if(v == null) {
good = false;
break;
}
- for (int d = 0; d < v.getDimensionality(); d++) {
- if (Double.isNaN(v.doubleValue(d))) {
+ for(int d = 0; d < v.getDimensionality(); d++) {
+ if(Double.isNaN(v.doubleValue(d))) {
good = false;
break;
}
}
}
- if (good) {
+ if(good) {
bundle.appendSimple(row);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/NoMissingValuesFilter.java
index b3f0af53..9b7ab977 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/NoMissingValuesFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,9 @@ import java.util.ArrayList;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
@@ -35,6 +37,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.NoMissingValuesFilter" })
public class NoMissingValuesFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -123,7 +126,7 @@ public class NoMissingValuesFilter extends AbstractStreamFilter {
*/
public static class Parameterizer extends AbstractParameterizer {
@Override
- protected Object makeInstance() {
+ protected NoMissingValuesFilter makeInstance() {
return new NoMissingValuesFilter();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/ReplaceNaNWithRandomFilter.java
index 9029d8ea..96a5f059 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/ReplaceNaNWithRandomFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -30,8 +30,10 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -47,6 +49,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ReplaceNaNWithRandomFilter" })
public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -56,7 +59,7 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
/**
* Columns to check.
*/
- private NumberVector.Factory<?, ?>[] densecols = null;
+ private NumberVector.Factory<?>[] densecols = null;
/**
* Distribution to generate replacement values with.
@@ -88,28 +91,28 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
@Override
public Event nextEvent() {
- while (true) {
+ while(true) {
Event ev = source.nextEvent();
- switch(ev) {
+ switch(ev){
case END_OF_STREAM:
return ev;
case META_CHANGED:
updateMeta(source.getMeta());
return ev;
case NEXT_OBJECT:
- if (densecols == null) {
+ if(densecols == null) {
updateMeta(source.getMeta());
}
rows.clear();
- for (int j = 0; j < densecols.length; j++) {
+ for(int j = 0; j < densecols.length; j++) {
Object o = source.data(j);
- if (densecols[j] != null) {
- NumberVector<?> v = (NumberVector<?>) o;
+ if(densecols[j] != null) {
+ NumberVector v = (NumberVector) o;
double[] ro = null; // replacement
- if (v != null) {
- for (int i = 0; i < v.getDimensionality(); i++) {
- if (Double.isNaN(v.doubleValue(i))) {
- if (ro != null) {
+ if(v != null) {
+ for(int i = 0; i < v.getDimensionality(); i++) {
+ if(Double.isNaN(v.doubleValue(i))) {
+ if(ro != null) {
ro = v.getColumnVector().getArrayRef();
}
ro[i] = dist.nextRandom();
@@ -132,19 +135,19 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
*/
private void updateMeta(BundleMeta meta) {
final int cols = meta.size();
- densecols = new NumberVector.Factory<?, ?>[cols];
- for (int i = 0; i < cols; i++) {
- if (TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+ densecols = new NumberVector.Factory<?>[cols];
+ for(int i = 0; i < cols; i++) {
+ if(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
throw new AbortException("Filtering sparse vectors is not yet supported by this filter. Please contribute.");
}
- if (TypeUtil.FLOAT_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.FLOAT_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
VectorFieldTypeInformation<?> vmeta = (VectorFieldTypeInformation<?>) meta.get(i);
- densecols[i] = (NumberVector.Factory<?, ?>) vmeta.getFactory();
+ densecols[i] = (NumberVector.Factory<?>) vmeta.getFactory();
continue;
}
- if (TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
VectorFieldTypeInformation<?> vmeta = (VectorFieldTypeInformation<?>) meta.get(i);
- densecols[i] = (NumberVector.Factory<?, ?>) vmeta.getFactory();
+ densecols[i] = (NumberVector.Factory<?>) vmeta.getFactory();
continue;
}
}
@@ -152,25 +155,25 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
@Override
public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if (LOG.isDebuggingFinest()) {
+ if(LOG.isDebuggingFinest()) {
LOG.debugFinest("Removing records with NaN values.");
}
updateMeta(objects.meta());
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
bundle.appendColumn(objects.meta(j), new ArrayList<>());
}
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final Object[] row = objects.getRow(i);
- for (int j = 0; j < densecols.length; j++) {
- if (densecols[j] != null) {
- NumberVector<?> v = (NumberVector<?>) row[j];
+ for(int j = 0; j < densecols.length; j++) {
+ if(densecols[j] != null) {
+ NumberVector v = (NumberVector) row[j];
double[] ro = null; // replacement
- if (v != null) {
- for (int d = 0; d < v.getDimensionality(); d++) {
- if (Double.isNaN(v.doubleValue(d))) {
- if (ro != null) {
+ if(v != null) {
+ for(int d = 0; d < v.getDimensionality(); d++) {
+ if(Double.isNaN(v.doubleValue(d))) {
+ if(ro != null) {
ro = v.getColumnVector().getArrayRef();
}
ro[d] = dist.nextRandom();
@@ -207,7 +210,7 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<Distribution> distP = new ObjectParameter<>(REPLACEMENT_DISTRIBUTION, Distribution.class);
- if (config.grab(distP)) {
+ if(config.grab(distP)) {
dist = distP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java
new file mode 100644
index 00000000..5d4e2d2a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java
@@ -0,0 +1,219 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * Filter to remove all vectors that do not have the desired dimensionality.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.VectorDimensionalityFilter" })
+public class VectorDimensionalityFilter<V extends NumberVector> extends AbstractStreamFilter {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(VectorDimensionalityFilter.class);
+
+ /**
+ * The filtered meta.
+ */
+ BundleMeta meta;
+
+ /**
+ * The column to filter.
+ */
+ int column = -1;
+
+ /**
+ * Desired dimensionality.
+ */
+ int dim = -1;
+
+ /**
+ * Constructor.
+ *
+ * @param dim Dimensionality to enforce. Pass -1 to leave it unspecified; the
+ * metadata update then adopts the minimum dimensionality declared by
+ * the column's vector field type information, if there is any.
+ * NOTE(review): the visible code never reads the dimensionality off
+ * the first vector itself - confirm whether that was intended.
+ */
+ public VectorDimensionalityFilter(int dim) {
+ super();
+ this.dim = dim;
+ }
+
+ /**
+ * Get the metadata describing the output of this filter.
+ *
+ * The filtered metadata is built lazily: it is (re)computed from the source
+ * metadata whenever the cached copy has been invalidated.
+ *
+ * @return the filtered metadata
+ */
+ @Override
+ public BundleMeta getMeta() {
+ if(meta == null) {
+ updateMeta();
+ }
+ // Return the filtered meta built above, not the unfiltered source meta;
+ // otherwise the adjusted column type would never reach the consumer.
+ return meta;
+ }
+
+ @Override
+ public Object data(int rnum) {
+ // Plain delegation: the object of the current row is returned exactly as
+ // the source provides it.
+ return source.data(rnum);
+ }
+
+ /**
+ * Advance the source stream, skipping objects whose vector in the filtered
+ * column is null or does not have the desired dimensionality.
+ *
+ * @return the next source event that was not skipped
+ */
+ @Override
+ public Event nextEvent() {
+ while(true) {
+ Event ev = source.nextEvent();
+ switch(ev){
+ case END_OF_STREAM:
+ return ev;
+ case META_CHANGED:
+ // Invalidate the cached meta; it is rebuilt lazily on next use.
+ meta = null;
+ return ev;
+ case NEXT_OBJECT:
+ if(meta == null) {
+ updateMeta();
+ }
+ // Filtering only happens once both a column and a dimensionality are
+ // known. NOTE(review): while dim is still -1 every object is passed
+ // through unchecked - confirm this is the intended behavior.
+ if(column >= 0 && dim >= 0) {
+ @SuppressWarnings("unchecked")
+ V vec = (V) source.data(column);
+ if(vec == null) {
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Skipping null vector.");
+ }
+ // Drop this object and pull the next event from the source.
+ continue;
+ }
+ if(vec.getDimensionality() != dim) {
+ if(LOG.isVeryVerbose()) {
+ // Log the complete row (all columns) of the rejected object.
+ StringBuilder buf = new StringBuilder();
+ buf.append("Skipping vector of wrong dimensionality ");
+ buf.append(vec.getDimensionality());
+ buf.append(':');
+ for(int i = 0; i < meta.size(); i++) {
+ buf.append(' ');
+ buf.append(source.data(i));
+ }
+ LOG.veryverbose(buf.toString());
+ }
+ // Drop this object and pull the next event from the source.
+ continue;
+ }
+ }
+ return ev;
+ }
+ }
+ }
+
+ /**
+ * Rebuild the filtered metadata from the source metadata.
+ *
+ * As long as no column has been selected, the first column whose type is
+ * assignable to a variable-length number vector is chosen as the column to
+ * filter. All other columns are copied over unchanged.
+ *
+ * NOTE(review): column is never reset in the visible code, so on a later
+ * rebuild no column is re-typed and all types are copied as they are -
+ * confirm whether this is intended.
+ *
+ * @throws AbortException if the field type information shows that no vector
+ * can have the desired dimensionality
+ */
+ private void updateMeta() {
+ meta = new BundleMeta();
+ BundleMeta origmeta = source.getMeta();
+ for(int i = 0; i < origmeta.size(); i++) {
+ SimpleTypeInformation<?> type = origmeta.get(i);
+ if(column < 0) {
+ // Test whether this type matches
+ if(TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(type)) {
+ if(type instanceof VectorFieldTypeInformation) {
+ @SuppressWarnings("unchecked")
+ final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+ // Fail early if the declared dimensionality range excludes dim.
+ if(dim != -1 && castType.mindim() > dim) {
+ throw new AbortException("Would filter all vectors: minimum dimensionality " + castType.mindim() + " > desired dimensionality " + dim);
+ }
+ if(dim != -1 && castType.maxdim() < dim) {
+ throw new AbortException("Would filter all vectors: maximum dimensionality " + castType.maxdim() + " < desired dimensionality " + dim);
+ }
+ // No dimensionality requested: adopt the declared minimum.
+ if(dim == -1) {
+ dim = castType.mindim();
+ }
+ // Minimum equals maximum: the type can be kept as it is.
+ if(castType.mindim() == castType.maxdim()) {
+ meta.add(castType);
+ column = i;
+ continue;
+ }
+ }
+ @SuppressWarnings("unchecked")
+ final VectorTypeInformation<V> castType = (VectorTypeInformation<V>) type;
+ if(dim != -1) {
+ // Advertise a field type with minimum and maximum both set to dim.
+ meta.add(new VectorFieldTypeInformation<>(FilterUtil.guessFactory(castType), dim, dim, castType.getSerializer()));
+ }
+ else {
+ // Dimensionality still unknown: keep the original type for now.
+ LOG.warning("No dimensionality yet for column " + i);
+ meta.add(castType);
+ }
+ column = i;
+ continue;
+ }
+ }
+ meta.add(type);
+ }
+ }
+
+ /**
+ * Parameterization helper: reads the optional dimensionality option and
+ * creates the filter from it.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Option ID of the dimensionality parameter.
+ */
+ private static final OptionID DIM_P = new OptionID("filter.dim", "Dimensionality of vectors to retain.");
+
+ /**
+ * Dimensionality to retain; -1 when the option was not given.
+ */
+ int dim = -1;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ // Optional integer option, constrained to values of at least one.
+ IntParameter dimensionality = new IntParameter(DIM_P)//
+ .setOptional(true)//
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(dimensionality)) {
+ dim = dimensionality.intValue();
+ }
+ else {
+ dim = -1;
+ }
+ }
+
+ @Override
+ protected VectorDimensionalityFilter<V> makeInstance() {
+ return new VectorDimensionalityFilter<>(dim);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java
new file mode 100644
index 00000000..b2d47e69
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Filters for data cleaning.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
index 0b4d7ae0..8ad13355 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,9 +33,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
*
* @author Elke Achtert
*
- * @param <O> Object type processed
+ * @param <V> Object type processed
*/
-public abstract class AbstractNormalization<O extends NumberVector<?>> extends AbstractVectorConversionFilter<O, O> implements Normalization<O> {
+public abstract class AbstractNormalization<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> implements Normalization<V> {
/**
* Initializes the option handler and the parameter map.
*/
@@ -44,12 +44,18 @@ public abstract class AbstractNormalization<O extends NumberVector<?>> extends A
}
@Override
- protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<O> in) {
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
initializeOutputType(in);
return in;
}
@Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ // FIXME: implement everywhere.
+ throw new UnsupportedOperationException("Not implemented yet.");
+ }
+
+ @Override
public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
// FIXME: implement.
throw new UnsupportedOperationException("Not yet implemented!");
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
index 54fc7794..38e0bf31 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,9 +33,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
*
* @author Erich Schubert
*
- * @param <O> Object type processed
+ * @param <V> Object type processed
*/
-public abstract class AbstractStreamNormalization<O extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<O, O> implements Normalization<O> {
+public abstract class AbstractStreamNormalization<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> implements Normalization<V> {
/**
* Initializes the option handler and the parameter map.
*/
@@ -44,12 +44,17 @@ public abstract class AbstractStreamNormalization<O extends NumberVector<?>> ext
}
@Override
- protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<O> in) {
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
initializeOutputType(in);
return in;
}
@Override
+ public V restore(V featureVector) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
// FIXME: implement.
throw new UnsupportedOperationException("Not yet implemented!");
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
index 0abaac95..d9002c93 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
index 3c3e7bdf..bf913852 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
/**
* Normalization performs a normalization on a set of feature vectors and is
@@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
*
* @param <O> object type
*/
-public interface Normalization<O> extends ObjectFilter, Parameterizable {
+public interface Normalization<O> extends ObjectFilter {
/**
* Transforms a feature vector to the original attribute ranges.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
deleted file mode 100644
index 09b73aa4..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import gnu.trove.map.hash.TIntDoubleHashMap;
-import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-
-/**
- * Perform full TF-IDF Normalization as commonly used in text mining.
- *
- * Each record is first normalized using "term frequencies" to sum up to 1. Then
- * it is globally normalized using the Inverse Document Frequency, so rare terms
- * are weighted stronger than common terms.
- *
- * Restore will only undo the IDF part of the normalization!
- *
- * @author Erich Schubert
- *
- * @param <V> Vector type
- */
-public class TFIDFNormalization<V extends SparseNumberVector<?>> extends InverseDocumentFrequencyNormalization<V> {
- /**
- * Class logger.
- */
- private static final Logging LOG = Logging.getLogger(TFIDFNormalization.class);
-
- /**
- * Constructor.
- */
- public TFIDFNormalization() {
- super();
- }
-
- @Override
- protected V filterSingleObject(V featureVector) {
- double sum = 0.0;
- for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) {
- sum += featureVector.iterDoubleValue(it);
- }
- if(sum <= 0) {
- sum = 1.0;
- }
- TIntDoubleHashMap vals = new TIntDoubleHashMap();
- for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) {
- final int dim = featureVector.iterDim(it);
- vals.put(dim, featureVector.iterDoubleValue(it) / sum * idf.get(dim));
- }
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java
new file mode 100644
index 00000000..a1618b9f
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java
@@ -0,0 +1,326 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta.BestFitEstimator;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParameter;
+
+/**
+ * Project the data using a Beta distribution.
+ *
+ * This is a crude heuristic, that may or may not work for your data set. There
+ * currently is no theoretical foundation of why it may be sensible or not to do
+ * this.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ * @apiviz.uses DistributionEstimator
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseBetaNormalization"})
+public class AttributeWiseBetaNormalization<V extends NumberVector> implements Normalization<V> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(AttributeWiseBetaNormalization.class);
+
+ /**
+ * Stores the distribution estimators
+ */
+ private List<DistributionEstimator<?>> estimators;
+
+ /**
+ * Stores the estimated distributions
+ */
+ private List<Distribution> dists;
+
+ /**
+ * Number vector factory.
+ */
+ protected NumberVector.Factory<V> factory;
+
+ /**
+ * Expected outlier rate alpha.
+ */
+ protected double alpha = 0.01;
+
+ /**
+ * Constructor.
+ *
+ * @param estimators Distribution estimators
+ * @param alpha Alpha value from which the shape parameter of the Beta
+ * distribution used for projection is derived
+ */
+ public AttributeWiseBetaNormalization(List<DistributionEstimator<?>> estimators, double alpha) {
+ super();
+ this.estimators = estimators;
+ this.alpha = alpha;
+ }
+
+ /**
+ * Normalize every number vector field column of the bundle.
+ *
+ * For each dimension a distribution is fitted (with several estimators, the
+ * one with the smallest Kolmogorov-Smirnov deviation wins). Every value is
+ * then mapped through the fitted CDF and the quantile function of a
+ * symmetric Beta distribution whose shape is derived from alpha and the
+ * dimensionality.
+ *
+ * @param objects Bundle to process; an empty bundle is returned untouched
+ * @return the bundle that was passed in, with its vector columns replaced
+ * @throws IllegalStateException if no estimator produced a usable
+ * distribution for some dimension
+ */
+ @Override
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ if(objects.dataLength() == 0) {
+ return objects;
+ }
+ for(int r = 0; r < objects.metaLength(); r++) {
+ SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
+ final List<?> column = (List<?>) objects.getColumn(r);
+ // Columns that are not number vector fields are left alone.
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ continue;
+ }
+ @SuppressWarnings("unchecked")
+ final List<V> castColumn = (List<V>) column;
+ // Get the replacement type information
+ @SuppressWarnings("unchecked")
+ final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+ factory = FilterUtil.guessFactory(castType);
+
+ // Scan to find the best
+ final int dim = castType.getDimensionality();
+ dists = new ArrayList<>(dim);
+ // Scratch space for testing:
+ double[] test = new double[castColumn.size()];
+
+ // We iterate over dimensions, this kind of filter needs fast random
+ // access.
+ Adapter adapter = new Adapter();
+ for(int d = 0; d < dim; d++) {
+ adapter.dim = d;
+ // A single estimator needs no comparison: use its result directly.
+ if(estimators.size() == 1) {
+ dists.add(estimators.get(0).estimate(castColumn, adapter));
+ continue;
+ }
+ Distribution best = null;
+ double bestq = Double.POSITIVE_INFINITY;
+ trials: for(DistributionEstimator<?> est : estimators) {
+ try {
+ Distribution dist = est.estimate(castColumn, adapter);
+ for(int i = 0; i < test.length; i++) {
+ test[i] = dist.cdf(castColumn.get(i).doubleValue(d));
+ // Discard fits that yield non-finite CDF values.
+ if(Double.isNaN(test[i])) {
+ LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
+ continue trials;
+ }
+ if(Double.isInfinite(test[i])) {
+ LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
+ continue trials;
+ }
+ }
+ Arrays.sort(test);
+ double q = KolmogorovSmirnovTest.simpleTest(test);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
+ }
+ if(best == null || q < bestq) {
+ best = dist;
+ bestq = q;
+ }
+ }
+ catch(ArithmeticException e) {
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
+ }
+ continue;
+ }
+ }
+ // Every estimator may have failed (or the list may be empty). Report
+ // this explicitly instead of running into a NullPointerException when
+ // the missing distribution is logged or evaluated below.
+ if(best == null) {
+ throw new IllegalStateException("No distribution estimator produced a usable fit for dimension " + d);
+ }
+ if(LOG.isVerbose()) {
+ LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
+ }
+ dists.add(best);
+ }
+
+ // Beta distribution for projection
+ double p = Math.pow(alpha, -1 / Math.sqrt(dim));
+ BetaDistribution beta = new BetaDistribution(p, p);
+ // Normalization scan
+ double[] buf = new double[dim];
+ for(int i = 0; i < objects.dataLength(); i++) {
+ final V obj = castColumn.get(i);
+ for(int d = 0; d < dim; d++) {
+ // TODO: when available, use logspace for better numerical precision!
+ buf[d] = beta.quantile(dists.get(d).cdf(obj.doubleValue(d)));
+ }
+ castColumn.set(i, factory.newNumberVector(buf));
+ }
+ }
+ return objects;
+ }
+
+ /**
+ * Not supported by this normalization.
+ *
+ * @throws UnsupportedOperationException always
+ */
+ @Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+ }
+
+ /**
+ * Not supported by this normalization.
+ *
+ * @throws UnsupportedOperationException always
+ */
+ @Override
+ public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+ throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+ }
+
+ /**
+ * Describe this normalization: its class name, followed by the simple class
+ * names of the configured estimators, separated by commas.
+ */
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder();
+ buf.append("normalization class: ").append(getClass().getName()).append('\n');
+ buf.append("normalization distributions: ");
+ // Nothing is emitted before the first entry; afterwards a comma is used.
+ boolean needSeparator = false;
+ for(DistributionEstimator<?> estimator : estimators) {
+ if(needSeparator) {
+ buf.append(',');
+ }
+ buf.append(estimator.getClass().getSimpleName());
+ needSeparator = true;
+ }
+ return buf.toString();
+ }
+
+ /**
+ * Array adapter class for vectors.
+ *
+ * Presents one dimension of a list of vectors as an array of numbers: the
+ * array index selects the vector, the {@link #dim} field selects the
+ * component that is read from it.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector>> {
+ /**
+ * Dimension to process. Mutable; every accessor below reads this component.
+ */
+ int dim;
+
+ @Override
+ public int size(List<? extends NumberVector> array) {
+ return array.size();
+ }
+
+ @Override
+ public Double get(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ // Boxed variant of getDouble.
+ return getDouble(array, off);
+ }
+
+ @Override
+ public double getDouble(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).doubleValue(dim);
+ }
+
+ @Override
+ public float getFloat(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).floatValue(dim);
+ }
+
+ @Override
+ public int getInteger(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).intValue(dim);
+ }
+
+ @Override
+ public short getShort(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).shortValue(dim);
+ }
+
+ @Override
+ public long getLong(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).longValue(dim);
+ }
+
+ @Override
+ public byte getByte(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+ return array.get(off).byteValue(dim);
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * Reads the list of distribution estimators and the alpha value, and creates
+ * the normalization from them.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for distribution estimators.
+ */
+ public static final OptionID DISTRIBUTIONS_ID = new OptionID("betanormalize.distributions", "A list of the distribution estimators to try.");
+
+ /**
+ * Shape parameter.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("betanormalize.alpha", "Alpha parameter to control the shape of the output distribution.");
+
+ /**
+ * Stores the distribution estimators
+ */
+ private List<DistributionEstimator<?>> estimators;
+
+ /**
+ * Expected outlier rate alpha.
+ */
+ private double alpha;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectListParameter<DistributionEstimator<?>> estP = new ObjectListParameter<>(DISTRIBUTIONS_ID, DistributionEstimator.class);
+ // Default: a single BestFitEstimator.
+ List<Class<? extends DistributionEstimator<?>>> def = new ArrayList<>(1);
+ def.add(BestFitEstimator.class);
+ estP.setDefaultValue(def);
+ if(config.grab(estP)) {
+ estimators = estP.instantiateClasses(config);
+ }
+
+ // NOTE(review): the option default is 0.1, while the alpha field of the
+ // enclosing normalization class is initialized to 0.01 - confirm which
+ // default is intended. No range constraint is attached to this option.
+ DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.1);
+ if(config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+ }
+
+ @Override
+ protected AttributeWiseBetaNormalization<V> makeInstance() {
+ return new AttributeWiseBetaNormalization<>(estimators, alpha);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java
index dd86cc5a..be501b11 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,12 +33,16 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta.BestFitEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -66,7 +70,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParamet
* @apiviz.uses DistributionEstimator
*/
// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements Normalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseCDFNormalization"})
+public class AttributeWiseCDFNormalization<V extends NumberVector> implements Normalization<V> {
/**
* Class logger.
*/
@@ -85,7 +90,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
/**
* Number vector factory.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
/**
* Constructor.
@@ -99,13 +104,13 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (objects.dataLength() == 0) {
+ if(objects.dataLength() == 0) {
return objects;
}
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
@SuppressWarnings("unchecked")
@@ -119,60 +124,33 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
final int dim = castType.getDimensionality();
dists = new ArrayList<>(dim);
// Scratch space for testing:
- double[] test = new double[castColumn.size()];
+ double[] test = estimators.size() > 1 ? new double[castColumn.size()] : null;
// We iterate over dimensions, this kind of filter needs fast random
// access.
Adapter adapter = new Adapter();
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
adapter.dim = d;
- if (estimators.size() == 1) {
- dists.add(estimators.get(0).estimate(castColumn, adapter));
- } else {
- Distribution best = null;
- double bestq = Double.POSITIVE_INFINITY;
- trials: for (DistributionEstimator<?> est : estimators) {
- try {
- Distribution dist = est.estimate(castColumn, adapter);
- for (int i = 0; i < test.length; i++) {
- test[i] = dist.cdf(castColumn.get(i).doubleValue(d));
- if (Double.isNaN(test[i])) {
- LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
- continue trials;
- }
- if (Double.isInfinite(test[i])) {
- LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
- continue trials;
- }
- }
- Arrays.sort(test);
- double q = KolmogorovSmirnovTest.simpleTest(test);
- if (LOG.isVeryVerbose()) {
- LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
- }
- if (best == null || q < bestq) {
- best = dist;
- bestq = q;
- }
- } catch (ArithmeticException e) {
- if (LOG.isVeryVerbose()) {
- LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
- }
- continue;
- }
- }
- if (LOG.isVerbose()) {
- LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
- }
- dists.add(best);
+ Distribution dist;
+ if(estimators.size() == 1) {
+ dist = estimators.get(0).estimate(castColumn, adapter);
+ }
+ else {
+ dist = findBestFit(castColumn, adapter, d, test);
+ }
+ // Special handling for constant distributions:
+ // We want them to remain 0, instead of - usually - becoming constant .5
+ if(dist instanceof UniformDistribution) {
+ dist = constantZero(castColumn, adapter) ? new UniformDistribution(0., 1.) : dist;
}
+ dists.add(dist);
}
// Normalization scan
double[] buf = new double[dim];
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
buf[d] = dists.get(d).cdf(obj.doubleValue(d));
}
castColumn.set(i, factory.newNumberVector(buf));
@@ -181,6 +159,71 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
return objects;
}
+  /**
+   * Find the best fitting distribution.
+   *
+   * Each configured estimator is fitted to the column. The CDF of the fitted
+   * distribution is evaluated for every value of dimension {@code d}, the
+   * results are sorted, and {@link KolmogorovSmirnovTest#simpleTest} scores
+   * them; the fit with the smallest score wins. An estimator is skipped when
+   * fitting throws an {@link ArithmeticException} or when its CDF produces a
+   * NaN or infinite value.
+   *
+   * @param col Column of table
+   * @param adapter Adapter for accessing the data
+   * @param d Dimension
+   * @param test Scratch space for testing goodness of fit
+   * @return Best fit distribution, never {@code null}
+   * @throws IllegalStateException if no estimator produced a usable fit
+   */
+  protected Distribution findBestFit(final List<V> col, Adapter adapter, int d, double[] test) {
+    Distribution best = null;
+    double bestq = Double.POSITIVE_INFINITY;
+    trials: for(DistributionEstimator<?> est : estimators) {
+      try {
+        Distribution dist = est.estimate(col, adapter);
+        for(int i = 0; i < test.length; i++) {
+          test[i] = dist.cdf(col.get(i).doubleValue(d));
+          // A single unusable CDF value disqualifies this estimator.
+          if(Double.isNaN(test[i])) {
+            LOG.warning("Got NaN after fitting " + est + ": " + dist);
+            continue trials;
+          }
+          if(Double.isInfinite(test[i])) {
+            LOG.warning("Got infinite value after fitting " + est + ": " + dist);
+            continue trials;
+          }
+        }
+        // The goodness-of-fit test is given the CDF values in sorted order.
+        Arrays.sort(test);
+        double q = KolmogorovSmirnovTest.simpleTest(test);
+        if(LOG.isVeryVerbose()) {
+          LOG.veryverbose("Estimator " + est + " (" + dist + ") has maximum deviation " + q + " for dimension " + d);
+        }
+        if(best == null || q < bestq) {
+          best = dist;
+          bestq = q;
+        }
+      }
+      catch(ArithmeticException e) {
+        if(LOG.isVeryVerbose()) {
+          LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
+        }
+        continue trials;
+      }
+    }
+    if(best == null) {
+      // Every estimator was rejected, or none was configured. Fail here with
+      // a clear message: the result is dereferenced below and by the caller,
+      // so a null would only surface as a NullPointerException.
+      throw new IllegalStateException("No distribution estimator produced a usable fit for dimension " + d + ".");
+    }
+    if(LOG.isVerbose()) {
+      LOG.verbose("Best fit for dimension " + d + ": " + best);
+    }
+    return best;
+  }
+
+  /**
+   * Check whether the attribute currently selected by the adapter holds
+   * nothing but zeros. An empty column counts as constant zero.
+   *
+   * @param column Column
+   * @param adapter Data accessor.
+   * @return {@code true} if all values are zero
+   */
+  protected boolean constantZero(List<V> column, Adapter adapter) {
+    final int size = adapter.size(column);
+    int pos = 0;
+    while(pos < size) {
+      // The first non-zero value settles the question.
+      if(adapter.get(column, pos) != 0.) {
+        return false;
+      }
+      pos++;
+    }
+    return true;
+  }
+
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
@@ -198,8 +241,8 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
result.append('\n');
result.append("normalization distributions: ");
boolean first = true;
- for (DistributionEstimator<?> est : estimators) {
- if (!first) {
+ for(DistributionEstimator<?> est : estimators) {
+ if(!first) {
result.append(',');
}
first = false;
@@ -212,52 +255,52 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
* Array adapter class for vectors.
*
* @author Erich Schubert
- *
+ *
* @apiviz.exclude
*/
- private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector<?>>> {
+ private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector>> {
/**
* Dimension to process.
*/
int dim;
@Override
- public int size(List<? extends NumberVector<?>> array) {
+ public int size(List<? extends NumberVector> array) {
return array.size();
}
@Override
- public Double get(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public Double get(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return getDouble(array, off);
}
@Override
- public double getDouble(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public double getDouble(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).doubleValue(dim);
}
@Override
- public float getFloat(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public float getFloat(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).floatValue(dim);
}
@Override
- public int getInteger(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public int getInteger(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).intValue(dim);
}
@Override
- public short getShort(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public short getShort(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).shortValue(dim);
}
@Override
- public long getLong(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public long getLong(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).longValue(dim);
}
@Override
- public byte getByte(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public byte getByte(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).byteValue(dim);
}
}
@@ -269,7 +312,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter for distribution estimators.
*/
@@ -287,7 +330,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
List<Class<? extends DistributionEstimator<?>>> def = new ArrayList<>(1);
def.add(BestFitEstimator.class);
estP.setDefaultValue(def);
- if (config.grab(estP)) {
+ if(config.grab(estP)) {
estimators = estP.instantiateClasses(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java
index 9a263171..e4af3a92 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
* Attribute-wise Normalization using the error function. This mostly makes
@@ -35,11 +37,12 @@ import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
*
* @author Erich Schubert
*
- * @param <O> Object type
+ * @param <V> Object type
*
* @apiviz.uses NumberVector
*/
-public class AttributeWiseErfNormalization<O extends NumberVector<?>> extends AbstractNormalization<O> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseErfNormalization"})
+public class AttributeWiseErfNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
@@ -53,26 +56,21 @@ public class AttributeWiseErfNormalization<O extends NumberVector<?>> extends Ab
}
@Override
- public O restore(O featureVector) {
- throw new UnsupportedOperationException("Not implemented yet.");
- }
-
- @Override
- protected O filterSingleObject(O obj) {
+ protected V filterSingleObject(V obj) {
double[] val = new double[obj.getDimensionality()];
- for (int i = 0; i < val.length; i++) {
+ for(int i = 0; i < val.length; i++) {
val[i] = NormalDistribution.erf(obj.doubleValue(i));
}
return factory.newNumberVector(val);
}
@Override
- protected SimpleTypeInformation<? super O> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ protected Logging getLogger() {
+ return LOG;
}
@Override
- protected Logging getLogger() {
- return LOG;
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
index 8c4f15e1..ec50aadd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,10 +31,13 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
@@ -54,7 +57,8 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
* @apiviz.uses NumberVector
*/
// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements Normalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMADNormalization"})
+public class AttributeWiseMADNormalization<V extends NumberVector> implements Normalization<V> {
/**
* Class logger.
*/
@@ -63,7 +67,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
/**
* Number vector factory.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
/**
* Stores the median in each dimension.
@@ -71,9 +75,9 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
private double[] median = new double[0];
/**
- * Stores the median absolute deviation in each dimension.
+ * Stores the inverse median absolute deviation in each dimension.
*/
- private double[] madsigma = new double[0];
+ private double[] imadsigma = new double[0];
/**
* Constructor.
@@ -84,13 +88,13 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (objects.dataLength() == 0) {
+ if(objects.dataLength() == 0) {
return objects;
}
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
@SuppressWarnings("unchecked")
@@ -103,61 +107,72 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
// Scan to find the best
final int dim = castType.getDimensionality();
median = new double[dim];
- madsigma = new double[dim];
+ imadsigma = new double[dim];
// Scratch space for testing:
double[] test = new double[castColumn.size()];
FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Analyzing data.", dim, LOG) : null;
// We iterate over dimensions, this kind of filter needs fast random
// access.
- for (int d = 0; d < dim; d++) {
- for (int i = 0; i < test.length; i++) {
+ for(int d = 0; d < dim; d++) {
+ for(int i = 0; i < test.length; i++) {
test[i] = castColumn.get(i).doubleValue(d);
}
final double med = QuickSelect.median(test);
median[d] = med;
- for (int i = 0; i < test.length; i++) {
+ int zeros = 0;
+ for(int i = 0; i < test.length; i++) {
test[i] = Math.abs(test[i] - med);
+ if(test[i] == 0.) {
+ zeros++;
+ }
}
// Rescale the true MAD for the best standard deviation estimate:
- madsigma[d] = QuickSelect.median(test) * NormalDistribution.ONEBYPHIINV075;
- if (dprog != null) {
- dprog.incrementProcessed(LOG);
+ if(zeros < (test.length >>> 1)) {
+ imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(test);
}
+ else if(zeros == test.length) {
+ LOG.warning("Constant attribute detected. Using MAD=1.");
+ imadsigma[d] = 1.; // Does not matter. Constant distribution.
+ }
+ else {
+ // We have more than 50% zeros, so the regular MAD estimate does not
+ // work. Generalize the MAD approach to use the 50% non-zero value:
+ final int rank = zeros + ((test.length - zeros) >> 1);
+ final double rel = .5 + rank * .5 / test.length;
+ imadsigma[d] = NormalDistribution.quantile(0., 1., rel) / QuickSelect.quickSelect(test, rank);
+ LOG.warning("Near-constant attribute detected. Using modified MAD.");
+ }
+ LOG.incrementProcessed(dprog);
}
- if (dprog != null) {
- dprog.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(dprog);
FiniteProgress nprog = LOG.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), LOG) : null;
// Normalization scan
double[] buf = new double[dim];
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
buf[d] = normalize(d, obj.doubleValue(d));
}
castColumn.set(i, factory.newNumberVector(buf));
- if (nprog != null) {
- nprog.incrementProcessed(LOG);
- }
- }
- if (nprog != null) {
- nprog.ensureCompleted(LOG);
+ LOG.incrementProcessed(nprog);
}
+ LOG.ensureCompleted(nprog);
}
return objects;
}
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if (featureVector.getDimensionality() == median.length) {
+ if(featureVector.getDimensionality() == median.length) {
double[] values = new double[featureVector.getDimensionality()];
- for (int d = 0; d < featureVector.getDimensionality(); d++) {
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
values[d] = restore(d, featureVector.doubleValue(d));
}
return factory.newNumberVector(values);
- } else {
+ }
+ else {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + median.length);
}
}
@@ -175,7 +190,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
* @return Normalized value
*/
private double normalize(int d, double val) {
- return (val - median[d]) / madsigma[d];
+ return (val - median[d]) * imadsigma[d];
}
/**
@@ -186,7 +201,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
* @return Normalized value
*/
private double restore(int d, double val) {
- return (val * madsigma[d]) + median[d];
+ return (val / imadsigma[d]) + median[d];
}
@Override
@@ -196,7 +211,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
result.append('\n');
result.append("normalization median: ").append(FormatUtil.format(median));
result.append('\n');
- result.append("normalization MAD sigma: ").append(FormatUtil.format(madsigma));
+ result.append("normalization scaling factor: ").append(FormatUtil.format(imadsigma));
return result.toString();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java
new file mode 100644
index 00000000..1039ab5b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java
@@ -0,0 +1,207 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+
+/**
+ * Normalization designed for data with a <em>meaningful zero</em>: Each
+ * attribute is scaled to have the same mean (but 0 is not changed).
+ *
+ * Every value is divided by the mean of its attribute. The means are either
+ * passed to the constructor or computed from the data in a preparation pass.
+ *
+ * @author Erich Schubert
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMeanNormalization"})
+public class AttributeWiseMeanNormalization<V extends NumberVector> extends AbstractNormalization<V> {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(AttributeWiseMeanNormalization.class);
+
+  /**
+   * Stores the mean in each dimension. A single entry is applied to every
+   * dimension.
+   */
+  private double[] mean = null;
+
+  /**
+   * Temporary storage used during initialization: per-dimension sums.
+   */
+  double[] sums = null;
+
+  /**
+   * Number of vectors seen during initialization.
+   */
+  int c = 0;
+
+  /**
+   * Constructor.
+   *
+   * @param mean Mean value
+   */
+  public AttributeWiseMeanNormalization(double[] mean) {
+    super();
+    this.mean = mean;
+  }
+
+  /**
+   * Constructor.
+   */
+  public AttributeWiseMeanNormalization() {
+    super();
+  }
+
+  /**
+   * Report whether the means still have to be computed.
+   *
+   * @param in Input type information (unused)
+   * @return {@code true} when no means are available yet; presumably this
+   *         requests the preparation pass from the superclass (confirm
+   *         against AbstractNormalization)
+   */
+  @Override
+  protected boolean prepareStart(SimpleTypeInformation<V> in) {
+    return (mean == null || mean.length == 0);
+  }
+
+  /**
+   * Add one vector to the per-dimension sums.
+   *
+   * @param featureVector Vector to accumulate
+   */
+  @Override
+  protected void prepareProcessInstance(V featureVector) {
+    // First object? Then init. (We didn't have a dimensionality before!)
+    if(sums == null || sums.length == 0) {
+      int dimensionality = featureVector.getDimensionality();
+      sums = new double[dimensionality];
+    }
+    for(int d = 0; d < featureVector.getDimensionality(); d++) {
+      sums[d] += featureVector.doubleValue(d);
+    }
+    ++c;
+  }
+
+  /**
+   * Turn the accumulated sums into means and release the temporary storage.
+   * Does nothing when no vector was accumulated.
+   */
+  @Override
+  protected void prepareComplete() {
+    if(sums == null) {
+      // No instance was processed, so there is nothing to average.
+      return;
+    }
+    // The summary is emitted at debug-fine level, so only build it when that
+    // level is enabled.
+    StringBuilder buf = LOG.isDebuggingFine() ? new StringBuilder() : null;
+    final int dimensionality = sums.length;
+    mean = new double[dimensionality];
+    if(buf != null) {
+      buf.append("Normalization parameters: ");
+    }
+    for(int d = 0; d < dimensionality; d++) {
+      mean[d] = sums[d] / c;
+      if(buf != null) {
+        buf.append(" m: ").append(mean[d]);
+      }
+    }
+    sums = null;
+    if(buf != null) {
+      LOG.debugFine(buf.toString());
+    }
+  }
+
+  /**
+   * Normalize one vector.
+   *
+   * @param featureVector Vector to normalize
+   * @return New vector with every component divided by its mean
+   */
+  @Override
+  protected V filterSingleObject(V featureVector) {
+    double[] values = new double[featureVector.getDimensionality()];
+    for(int d = 0; d < featureVector.getDimensionality(); d++) {
+      values[d] = normalize(d, featureVector.doubleValue(d));
+    }
+    return factory.newNumberVector(values);
+  }
+
+  /**
+   * Undo the normalization of one vector.
+   *
+   * @param featureVector Normalized vector
+   * @return New vector with every component multiplied by its mean
+   * @throws NonNumericFeaturesException if the dimensionality differs from
+   *         the number of stored means
+   */
+  @Override
+  public V restore(V featureVector) throws NonNumericFeaturesException {
+    if(featureVector.getDimensionality() != mean.length) {
+      throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
+    }
+    double[] values = new double[featureVector.getDimensionality()];
+    for(int d = 0; d < featureVector.getDimensionality(); d++) {
+      values[d] = restore(d, featureVector.doubleValue(d));
+    }
+    return factory.newNumberVector(values);
+  }
+
+  /**
+   * Normalize a single dimension.
+   *
+   * @param d Dimension
+   * @param val Value
+   * @return Normalized value
+   */
+  private double normalize(int d, double val) {
+    // A single stored mean is shared by all dimensions.
+    d = (mean.length == 1) ? 0 : d;
+    return val / mean[d];
+  }
+
+  /**
+   * Restore a single dimension.
+   *
+   * @param d Dimension
+   * @param val Value
+   * @return Restored value
+   */
+  private double restore(int d, double val) {
+    // A single stored mean is shared by all dimensions.
+    d = (mean.length == 1) ? 0 : d;
+    return val * mean[d];
+  }
+
+  /**
+   * Rescale the coefficients of a linear equation system by the means.
+   *
+   * Each coefficient is divided exactly once by the mean of its column. The
+   * right hand side is left untouched, because this normalization is a pure
+   * scaling without an offset.
+   *
+   * @param linearEquationSystem Equation system to transform
+   * @return Equation system built from the rescaled coefficients
+   */
+  @Override
+  public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+    double[][] coeff = linearEquationSystem.getCoefficents();
+    double[] rhs = linearEquationSystem.getRHS();
+    int[] row = linearEquationSystem.getRowPermutations();
+    int[] col = linearEquationSystem.getColumnPermutations();
+
+    for(int r = 0; r < coeff.length; r++) {
+      final double[] line = coeff[row[r]];
+      for(int j = 0; j < coeff[0].length; j++) {
+        // Same single-mean broadcast as in normalize() and restore().
+        line[col[j]] /= mean[(mean.length == 1) ? 0 : j];
+      }
+    }
+
+    return new LinearEquationSystem(coeff, rhs, row, col);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder();
+    result.append("normalization class: ").append(getClass().getName());
+    result.append('\n');
+    result.append("normalization means: ").append(FormatUtil.format(mean));
+
+    return result.toString();
+  }
+
+  @Override
+  protected Logging getLogger() {
+    return LOG;
+  }
+
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.NUMBER_VECTOR_FIELD;
+  }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java
index 47b6db5f..26a125ad 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,11 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -46,24 +49,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParamet
*
* @apiviz.uses NumberVector
*/
-// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMinMaxNormalization"})
+public class AttributeWiseMinMaxNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(AttributeWiseMinMaxNormalization.class);
/**
- * Parameter for minimum.
- */
- public static final OptionID MINIMA_ID = new OptionID("normalize.min", "a comma separated concatenation of the minimum values in each dimension that are mapped to 0. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
-
- /**
- * Parameter for maximum.
- */
- public static final OptionID MAXIMA_ID = new OptionID("normalize.max", "a comma separated concatenation of the maximum values in each dimension that are mapped to 1. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
-
- /**
* Stores the maximum in each dimension.
*/
private double[] maxima = new double[0];
@@ -130,16 +123,14 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if(featureVector.getDimensionality() == maxima.length && featureVector.getDimensionality() == minima.length) {
- double[] values = new double[featureVector.getDimensionality()];
- for(int d = 0; d < featureVector.getDimensionality(); d++) {
- values[d] = (featureVector.doubleValue(d) * (factor(d)) + minima[d]);
- }
- return factory.newNumberVector(values);
- }
- else {
+ if(featureVector.getDimensionality() != maxima.length || featureVector.getDimensionality() != minima.length) {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + maxima.length);
}
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = featureVector.doubleValue(d) * factor(d) + minima[d];
+ }
+ return factory.newNumberVector(values);
}
/**
@@ -174,8 +165,7 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
}
}
- LinearEquationSystem lq = new LinearEquationSystem(coeff, rhs, row, col);
- return lq;
+ return new LinearEquationSystem(coeff, rhs, row, col);
}
@Override
@@ -190,13 +180,13 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
}
@Override
- protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ protected Logging getLogger() {
+ return LOG;
}
@Override
- protected Logging getLogger() {
- return LOG;
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
}
/**
@@ -206,7 +196,17 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for minimum.
+ */
+ public static final OptionID MINIMA_ID = new OptionID("normalize.min", "a comma separated concatenation of the minimum values in each dimension that are mapped to 0. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for maximum.
+ */
+ public static final OptionID MAXIMA_ID = new OptionID("normalize.max", "a comma separated concatenation of the maximum values in each dimension that are mapped to 1. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
+
/**
* Stores the maximum in each dimension.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java
index a24cae25..a7241441 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,9 +26,12 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -48,32 +51,22 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParamet
*
* @apiviz.uses NumberVector
*/
-// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseVarianceNormalization", "z" })
+public class AttributeWiseVarianceNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(AttributeWiseVarianceNormalization.class);
/**
- * Parameter for means.
- */
- public static final OptionID MEAN_ID = new OptionID("normalize.mean", "a comma separated concatenation of the mean values in each dimension that are mapped to 0. If no value is specified, the mean value of the attribute range in this dimension will be taken.");
-
- /**
- * Parameter for stddevs.
- */
- public static final OptionID STDDEV_ID = new OptionID("normalize.stddev", "a comma separated concatenation of the standard deviations in each dimension that are scaled to 1. If no value is specified, the standard deviation of the attribute range in this dimension will be taken.");
-
- /**
* Stores the mean in each dimension.
*/
- private double[] mean = new double[0];
+ private double[] mean;
/**
* Stores the standard deviation in each dimension.
*/
- private double[] stddev = new double[0];
+ private double[] stddev;
/**
* Temporary storage used during initialization.
@@ -152,16 +145,14 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if(featureVector.getDimensionality() == mean.length) {
- double[] values = new double[featureVector.getDimensionality()];
- for(int d = 0; d < featureVector.getDimensionality(); d++) {
- values[d] = restore(d, featureVector.doubleValue(d));
- }
- return factory.newNumberVector(values);
- }
- else {
+ if(featureVector.getDimensionality() != mean.length) {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
}
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = restore(d, featureVector.doubleValue(d));
+ }
+ return factory.newNumberVector(values);
}
/**
@@ -172,12 +163,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
* @return Normalized value
*/
private double normalize(int d, double val) {
- if(mean.length == 1) {
- return (val - mean[0]) / stddev[0];
- }
- else {
- return (val - mean[d]) / stddev[d];
- }
+ d = (mean.length == 1) ? 0 : d;
+ return (val - mean[d]) / stddev[d];
}
/**
@@ -188,12 +175,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
* @return Restored (de-normalized) value
*/
private double restore(int d, double val) {
- if(mean.length == 1) {
- return (val * stddev[0]) + mean[0];
- }
- else {
- return (val * stddev[d]) + mean[d];
- }
+ d = (mean.length == 1) ? 0 : d;
+ return (val * stddev[d]) + mean[d];
}
@Override
@@ -214,13 +197,7 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
}
}
- LinearEquationSystem lq = new LinearEquationSystem(coeff, rhs, row, col);
- return lq;
- }
-
- @Override
- protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ return new LinearEquationSystem(coeff, rhs, row, col);
}
@Override
@@ -240,6 +217,11 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
return LOG;
}
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
+ }
+
/**
* Parameterization class.
*
@@ -247,7 +229,17 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for means.
+ */
+ public static final OptionID MEAN_ID = new OptionID("normalize.mean", "a comma separated concatenation of the mean values in each dimension that are mapped to 0. If no value is specified, the mean value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for stddevs.
+ */
+ public static final OptionID STDDEV_ID = new OptionID("normalize.stddev", "a comma separated concatenation of the standard deviations in each dimension that are scaled to 1. If no value is specified, the standard deviation of the attribute range in this dimension will be taken.");
+
/**
* Stores the mean in each dimension.
*/
@@ -261,22 +253,22 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- DoubleListParameter meanP = new DoubleListParameter(MEAN_ID, true);
- DoubleListParameter stddevP = new DoubleListParameter(STDDEV_ID, true);
- config.grab(meanP);
- config.grab(stddevP);
- // Note: grab first, then use isDefined, to ensure the stddev is grabbed.
- if(meanP.isDefined() && stddevP.isDefined()) {
+ DoubleListParameter meanP = new DoubleListParameter(MEAN_ID) //
+ .setOptional(true);
+ if(config.grab(meanP)) {
mean = ArrayLikeUtil.toPrimitiveDoubleArray(meanP.getValue());
+ }
+ DoubleListParameter stddevP = new DoubleListParameter(STDDEV_ID) //
+ .setOptional(true);
+ if(config.grab(stddevP)) {
stddev = ArrayLikeUtil.toPrimitiveDoubleArray(stddevP.getValue());
for(double d : stddev) {
- if(d == 0) {
+ if(d == 0.) {
config.reportError(new WrongParameterValueException("Standard deviations must not be 0."));
}
}
}
-
config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(meanP, stddevP));
config.checkConstraint(new EqualSizeGlobalConstraint(meanP, stddevP));
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java
index bb9c2aec..ca320ec6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
*/
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.IntegerVector;
@@ -34,21 +33,25 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
/**
* Normalize vectors according to their rank in the attributes.
*
- * Note: ranks are multiplied by 2, to be able to give ties an integer rank.
- * (e.g. first two records are tied at "1" then, followed by the next on "4")
+ * Note: <b>ranks are multiplied by 2</b>, to be able to give ties an integer
+ * rank (e.g. when the first two records are tied, they both get rank "1",
+ * and the next record gets rank "4").
*
* @author Erich Schubert
*/
-public class RankTieNormalization implements ObjectFilter {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.IntegerRankTieNormalization"})
+public class IntegerRankTieNormalization implements ObjectFilter {
/**
* Constructor.
*/
- public RankTieNormalization() {
+ public IntegerRankTieNormalization() {
super();
}
@@ -57,6 +60,12 @@ public class RankTieNormalization implements ObjectFilter {
final int len = objects.dataLength();
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+ int[] order = new int[len];
+ for(int i = 0; i < len; i++) {
+ order[i] = i;
+ }
+ Sorter comparator = new Sorter();
+
for(int r = 0; r < objects.metaLength(); r++) {
final SimpleTypeInformation<?> type = objects.meta(r);
final List<?> column = objects.getColumn(r);
@@ -65,7 +74,7 @@ public class RankTieNormalization implements ObjectFilter {
continue;
}
@SuppressWarnings("unchecked")
- final List<? extends NumberVector<?>> castColumn = (List<? extends NumberVector<?>>) column;
+ final List<? extends NumberVector> castColumn = (List<? extends NumberVector>) column;
// Get the replacement type information
final int dim = ((VectorFieldTypeInformation<?>) type).getDimensionality();
final VectorFieldTypeInformation<IntegerVector> outType = new VectorFieldTypeInformation<>(IntegerVector.STATIC, dim);
@@ -73,29 +82,21 @@ public class RankTieNormalization implements ObjectFilter {
// Output vectors
int[][] posvecs = new int[len][dim];
// Sort for each dimension
- // TODO: an int[] array would be enough, if we could use a comparator...
- DoubleIntPair[] sorter = new DoubleIntPair[len];
- for(int i = 0; i < sorter.length; i++) {
- sorter[i] = new DoubleIntPair(Double.NaN, -1);
- }
for(int d = 0; d < dim; d++) {
- // fill array
- for(int i = 0; i < sorter.length; i++) {
- sorter[i].first = castColumn.get(i).doubleValue(d);
- sorter[i].second = i;
- }
// Sort
- Arrays.sort(sorter);
+ comparator.setup(castColumn, d);
+ IntegerArrayQuickSort.sort(order, comparator);
// Transfer positions to output vectors
- for(int sta = 0; sta < sorter.length;) {
+ for(int sta = 0; sta < order.length;) {
+ double v = castColumn.get(order[sta]).doubleValue(d);
// Compute ties
int end = sta + 1;
- while(end < sorter.length && !(sorter[sta].first < sorter[end].first)) {
+ while(end < order.length && !(v < castColumn.get(order[end]).doubleValue(d))) {
end++;
}
final int pos = (sta + end - 1);
for(int i = sta; i < end; i++) {
- posvecs[sorter[i].second][d] = pos;
+ posvecs[order[i]][d] = pos;
}
sta = end;
}
@@ -110,4 +111,40 @@ public class RankTieNormalization implements ObjectFilter {
}
return bundle;
}
+
+ /**
+ * Class to sort an index array by a particular dimension.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class Sorter implements IntegerComparator {
+ /**
+ * Column to use for sorting.
+ */
+ List<? extends NumberVector> col;
+
+ /**
+ * Dimension to use for sorting.
+ */
+ int dim;
+
+ /**
+ * Configure the sorting class.
+ *
+ * @param col Column to read
+ * @param dim Dimension to use.
+ */
+ public void setup(List<? extends NumberVector> col, int dim) {
+ this.col = col;
+ this.dim = dim;
+ }
+
+ @Override
+ public int compare(int x, int y) {
+ final double vx = col.get(x).doubleValue(dim), vy = col.get(y).doubleValue(dim);
+ return (vx < vy) ? -1 : (vx == vy) ? 0 : +1;
+ }
+ }
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java
index 21263890..99054f83 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,11 +29,13 @@ import gnu.trove.map.hash.TIntDoubleHashMap;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
- * Normalization for text frequency vectors, using the inverse document
- * frequency.
+ * Normalization for term frequency (TF) vectors, using the inverse document
+ * frequency (IDF). See also: TF-IDF for text analysis.
*
* @author Erich Schubert
*
@@ -41,7 +43,8 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
*
* @param <V> Vector type
*/
-public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.InverseDocumentFrequencyNormalization" })
+public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
@@ -102,7 +105,7 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<
final int dim = featureVector.iterDim(it);
vals.put(dim, featureVector.iterDoubleValue(it) * idf.get(dim));
}
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
+ return ((SparseNumberVector.Factory<V>) factory).newNumberVector(vals, featureVector.getDimensionality());
}
@Override
@@ -112,12 +115,12 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<
final int dim = featureVector.iterDim(it);
vals.put(dim, featureVector.iterDoubleValue(it) / idf.get(dim));
}
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
+ return ((SparseNumberVector.Factory<V>) factory).newNumberVector(vals, featureVector.getDimensionality());
}
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.SPARSE_VECTOR_FIELD;
+ return TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH;
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java
new file mode 100644
index 00000000..f1fac885
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Normalizations operating on columns / variates, where each column is treated independently.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java
new file mode 100644
index 00000000..b2da96a9
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java
@@ -0,0 +1,97 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Normalize histograms by scaling them to L1 norm 1, then taking the square
+ * root in each attribute.
+ *
+ * Using Euclidean distance (linear kernel) and this transformation is the same
+ * as using Hellinger distance:
+ * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.probabilistic.HellingerDistanceFunction}
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.HellingerHistogramNormalization" })
+public class HellingerHistogramNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Static instance.
+ */
+ public static final HellingerHistogramNormalization<NumberVector> STATIC = new HellingerHistogramNormalization<>();
+
+ /**
+ * Constructor.
+ */
+ public HellingerHistogramNormalization() {
+ super();
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ double[] data = new double[featureVector.getDimensionality()];
+ double sum = 0.;
+ for(int d = 0; d < data.length; ++d) {
+ data[d] = featureVector.doubleValue(d);
+ data[d] = data[d] > 0 ? data[d] : -data[d];
+ sum += data[d];
+ }
+ // Normalize and sqrt:
+ if(sum > 0.) {
+ for(int d = 0; d < data.length; ++d) {
+ if(data[d] > 0) {
+ data[d] = Math.sqrt(data[d] / sum);
+ }
+ }
+ }
+ return factory.newNumberVector(data);
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected HellingerHistogramNormalization<NumberVector> makeInstance() {
+ return STATIC;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java
new file mode 100644
index 00000000..05485909
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java
@@ -0,0 +1,159 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Normalize vectors such that they have zero mean and unit variance.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class InstanceMeanVarianceNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Multiplicity of the vector.
+ */
+ private int multiplicity;
+
+ /**
+ * Constructor.
+ */
+ public InstanceMeanVarianceNormalization() {
+ super();
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ double[] raw = featureVector.getColumnVector().getArrayRef();
+ if(raw.length == 0) {
+ return factory.newNumberVector(new double[] {});
+ }
+ if(raw.length == 1) {
+ // Constant, but preserve NaNs
+ return factory.newNumberVector(new double[] { raw[0] == raw[0] ? 0. : Double.NaN });
+ }
+ // Multivariate codepath:
+ if(multiplicity > 1) {
+ assert (raw.length % multiplicity == 0) : "Vector length is not divisible by multiplicity?";
+ return factory.newNumberVector(multivariateStandardization(raw));
+ }
+ return factory.newNumberVector(univariateStandardization(raw));
+ }
+
+ protected double[] univariateStandardization(double[] raw) {
+ // Two-pass normalization is numerically most stable,
+ // and Java should optimize this well enough.
+ double sum = 0.;
+ for(int i = 0; i < raw.length; ++i) {
+ final double v = raw[i];
+ if(v != v) { // NaN guard
+ continue;
+ }
+ sum += v;
+ }
+ final double mean = sum / raw.length;
+ double ssum = 0.;
+ for(int i = 0; i < raw.length; ++i) {
+ double v = raw[i] - mean;
+ if(v != v) {
+ continue;
+ }
+ ssum += v * v;
+ }
+ final double std = Math.sqrt(ssum) / (raw.length - 1);
+ if(std > 0.) {
+ for(int i = 0; i < raw.length; ++i) {
+ raw[i] = (raw[i] - mean) / std;
+ }
+ }
+ return raw;
+ }
+
+ protected double[] multivariateStandardization(double[] raw) {
+ final int len = raw.length / multiplicity;
+ if(len <= 1) {
+ return raw;
+ }
+ // Two-pass normalization is numerically most stable,
+ // and Java should optimize this well enough.
+ double[] mean = new double[multiplicity];
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ final double v = raw[i];
+ if(v != v) { // NaN guard
+ continue;
+ }
+ mean[j] += v;
+ }
+ for(int j = 0; j < multiplicity; ++j) {
+ mean[j] /= len;
+ }
+ double[] std = new double[multiplicity];
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ double v = raw[i] - mean[j];
+ if(v != v) {
+ continue;
+ }
+ std[j] += v * v;
+ }
+ for(int j = 0; j < multiplicity; ++j) {
+ std[j] = std[j] > 0. ? Math.sqrt(std[j]) / (len - 1) : 1;
+ }
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ raw[i] = (raw[i] - mean[j]) / std[j];
+ }
+ return raw;
+ }
+
+ @Override
+ protected void initializeOutputType(SimpleTypeInformation<V> type) {
+ super.initializeOutputType(type);
+ multiplicity = ((VectorTypeInformation<?>) type).getMultiplicity();
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ @Override
+ protected InstanceMeanVarianceNormalization<V> makeInstance() {
+ return new InstanceMeanVarianceNormalization<>();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java
new file mode 100644
index 00000000..9f8f7680
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java
@@ -0,0 +1,177 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Normalize each vector (instance) independently, such that its smallest
+ * attribute value is mapped to {@code min} and its largest to {@code max}
+ * (0 and 1 by default).
+ *
+ * If the vector type reports a multiplicity larger than 1, the vector is read
+ * as interleaved components, and each component is rescaled separately. NaN
+ * values are ignored when searching for the extrema. Vectors (or components)
+ * that do not contain two distinct values keep their values.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class InstanceMinMaxNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+  /**
+   * Desired minimum value of the normalized vector.
+   */
+  private final double min;
+
+  /**
+   * Desired maximum value of the normalized vector.
+   */
+  private final double max;
+
+  /**
+   * Multiplicity of the vector.
+   */
+  private int multiplicity;
+
+  /**
+   * Constructor.
+   *
+   * @param min Desired minimum value
+   * @param max Desired maximum value
+   */
+  public InstanceMinMaxNormalization(double min, double max) {
+    super();
+    this.min = min;
+    this.max = max;
+  }
+
+  /**
+   * Constructor, normalizing to {@code [0;1]}
+   */
+  public InstanceMinMaxNormalization() {
+    this(0., 1.);
+  }
+
+  /**
+   * Rescale a single vector to the range {@code [min;max]}.
+   *
+   * @param featureVector Vector to normalize
+   * @return New vector built by the factory from the rescaled values
+   */
+  @Override
+  protected V filterSingleObject(V featureVector) {
+    double[] raw = featureVector.getColumnVector().getArrayRef();
+    if(multiplicity > 1) {
+      // Multivariate codepath: every interleaved component has its own range.
+      assert (raw.length % multiplicity == 0) : "Vector length is not divisible by multiplicity?";
+      for(int j = 0; j < multiplicity; j++) {
+        rescale(raw, j, multiplicity);
+      }
+    }
+    else {
+      // Default codepath: one common range for all attributes.
+      rescale(raw, 0, 1);
+    }
+    return factory.newNumberVector(raw);
+  }
+
+  /**
+   * Rescale, in place, the entries {@code start, start + stride, ...} of the
+   * array such that their minimum becomes {@code min} and their maximum
+   * becomes {@code max}.
+   *
+   * @param raw Array to modify
+   * @param start Index of the first entry of the component
+   * @param stride Distance between two entries of the component
+   */
+  private void rescale(double[] raw, int start, int stride) {
+    double mi = Double.POSITIVE_INFINITY, ma = Double.NEGATIVE_INFINITY;
+    for(int i = start; i < raw.length; i += stride) {
+      final double v = raw[i];
+      if(v != v) { // NaN guard
+        continue;
+      }
+      mi = (mi < v) ? mi : v;
+      ma = (ma > v) ? ma : v;
+    }
+    if(!(mi < ma)) {
+      // Constant, empty or all-NaN component: no meaningful range to map.
+      return;
+    }
+    final double s = (max - min) / (ma - mi);
+    // Visit only the entries of this component, beginning at its own offset.
+    for(int i = start; i < raw.length; i += stride) {
+      raw[i] = (raw[i] - mi) * s + min;
+    }
+  }
+
+  /**
+   * Delegates to the superclass, then stores the multiplicity reported by
+   * the vector type information.
+   *
+   * @param type Type information; it is cast to {@link VectorTypeInformation}
+   */
+  @Override
+  protected void initializeOutputType(SimpleTypeInformation<V> type) {
+    super.initializeOutputType(type);
+    multiplicity = ((VectorTypeInformation<?>) type).getMultiplicity();
+  }
+
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   *
+   * @param <V> vector type
+   */
+  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+    /**
+     * Option ID for minimum value.
+     */
+    public static final OptionID MIN_ID = new OptionID("normalization.min", "Minimum value to assign to objects.");
+
+    /**
+     * Option ID for maximum value.
+     */
+    public static final OptionID MAX_ID = new OptionID("normalization.max", "Maximum value to assign to objects.");
+
+    /**
+     * Minimum and maximum values.
+     */
+    private double min, max;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      // Both bounds are optional; their defaults are 0 and 1.
+      DoubleParameter minP = new DoubleParameter(MIN_ID, 0.) //
+      .setOptional(true);
+      if(config.grab(minP)) {
+        min = minP.doubleValue();
+      }
+      DoubleParameter maxP = new DoubleParameter(MAX_ID, 1.) //
+      .setOptional(true);
+      if(config.grab(maxP)) {
+        max = maxP.doubleValue();
+      }
+      // Constraint relating the two parameters (minimum versus maximum).
+      config.checkConstraint(new LessGlobalConstraint<>(minP, maxP));
+    }
+
+    @Override
+    protected InstanceMinMaxNormalization<V> makeInstance() {
+      return new InstanceMinMaxNormalization<>(min, max);
+    }
+  }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java
index a12dea3b..51b2a34b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,9 +26,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.Norm;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -42,42 +43,32 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> vector type
*/
-public class LengthNormalization<V extends NumberVector<?>> extends AbstractStreamNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.LengthNormalization"})
+public class LengthNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
/**
* Norm to use.
*/
- DoubleNorm<? super V> norm;
+ Norm<? super V> norm;
/**
* Constructor.
*
* @param norm Norm to use
*/
- public LengthNormalization(DoubleNorm<? super V> norm) {
+ public LengthNormalization(Norm<? super V> norm) {
super();
this.norm = norm;
}
@Override
protected V filterSingleObject(V featureVector) {
- final double d = norm.doubleNorm(featureVector);
+ final double d = norm.norm(featureVector);
return factory.newNumberVector(featureVector.getColumnVector().timesEquals(1 / d).getArrayRef());
}
@Override
- public V restore(V featureVector) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
- // TODO.
- throw new UnsupportedOperationException();
- }
-
- @Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
}
/**
@@ -87,7 +78,7 @@ public class LengthNormalization<V extends NumberVector<?>> extends AbstractStre
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Option ID for normalization norm.
*/
@@ -96,12 +87,12 @@ public class LengthNormalization<V extends NumberVector<?>> extends AbstractStre
/**
* Norm to use.
*/
- DoubleNorm<? super V> norm;
+ Norm<? super V> norm;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- ObjectParameter<DoubleNorm<? super V>> normP = new ObjectParameter<>(NORM_ID, DoubleNorm.class, EuclideanDistanceFunction.class);
+ ObjectParameter<Norm<? super V>> normP = new ObjectParameter<>(NORM_ID, Norm.class, EuclideanDistanceFunction.class);
if(config.grab(normP)) {
norm = normP.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java
new file mode 100644
index 00000000..8970e7ef
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java
@@ -0,0 +1,119 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Instance-wise logarithmic rescaling: every value x is replaced by
+ * log(1+|x|*b)/log(1+b), where b is the boosting factor. Input values from
+ * [0;1] are therefore mapped into [0;1] again.
+ *
+ * With the default b=1, the transformation is log2(1+|x|).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.Log1PlusNormalization" })
+public class Log1PlusNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+  /**
+   * Shared instance using the boosting factor 1.
+   */
+  public static final Log1PlusNormalization<NumberVector> STATIC = new Log1PlusNormalization<>(1.);
+
+  /**
+   * Boosting factor b.
+   */
+  protected double boost;
+
+  /**
+   * Scaling coefficient, 1/log(1+b).
+   */
+  protected double scale;
+
+  /**
+   * Constructor.
+   *
+   * @param boost Boosting parameter
+   */
+  public Log1PlusNormalization(double boost) {
+    super();
+    this.boost = boost;
+    this.scale = 1. / Math.log1p(boost);
+  }
+
+  /**
+   * Apply the logarithmic transformation to every attribute of the vector.
+   *
+   * @param featureVector Vector to transform
+   * @return New vector built by the factory from the transformed values
+   */
+  @Override
+  protected V filterSingleObject(V featureVector) {
+    final int dim = featureVector.getDimensionality();
+    final double[] transformed = new double[dim];
+    for(int i = 0; i < dim; i++) {
+      final double x = featureVector.doubleValue(i);
+      // Absolute value, written out as in the formula above.
+      final double magnitude = (x > 0) ? x : -x;
+      transformed[i] = Math.log1p(magnitude * boost) * scale;
+    }
+    return factory.newNumberVector(transformed);
+  }
+
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   *
+   * @param <V> vector type
+   */
+  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+    /**
+     * Boosting factor parameter.
+     */
+    public static final OptionID BOOST_ID = new OptionID("log1pscale.boost", "Boosting factor. Larger values will yield a steeper curve.");
+
+    /**
+     * Boosting factor.
+     */
+    protected double boost;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+
+      // Default boosting factor is 1; the constraint requires a value > 0.
+      final DoubleParameter boostParam = new DoubleParameter(BOOST_ID, 1.) //
+      .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+      if(!config.grab(boostParam)) {
+        return;
+      }
+      boost = boostParam.doubleValue();
+    }
+
+    @Override
+    protected Log1PlusNormalization<V> makeInstance() {
+      return new Log1PlusNormalization<>(boost);
+    }
+  }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java
new file mode 100644
index 00000000..9ac613c0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Instancewise normalization, where each instance is normalized independently.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
index 15d689d7..552d7003 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
index 87684499..249c3764 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
@@ -2,12 +2,13 @@
* <p>Data filtering, in particular for normalization and projection.</p>
*
* @apiviz.exclude de.lmu.ifi.dbs.elki.utilities.*
+ * @apiviz.exclude de.lmu.ifi.dbs.elki.datasource.filter\.(normalization|transform)\.*
*/
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ByLabelFilter.java
index 66707da6..8683ca8c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ByLabelFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,12 +23,15 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -36,12 +39,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
/**
- * A filter to sort the data set by some label.
+ * A filter to select data objects by their label.
*
* @author Erich Schubert
*
* @apiviz.uses LabelList oneway - - «reads»
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.ByLabelFilter", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ByLabelFilter" }) // first entry: class name before the move to the selection package; second entry kept for compatibility
public class ByLabelFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -49,9 +53,9 @@ public class ByLabelFilter extends AbstractStreamFilter {
private static final Logging LOG = Logging.getLogger(ByLabelFilter.class);
/**
- * The filter pattern
+ * The filter pattern matcher
*/
- private final Pattern pattern;
+ private final Matcher matcher;
/**
* Inversion flag
@@ -71,7 +75,7 @@ public class ByLabelFilter extends AbstractStreamFilter {
*/
public ByLabelFilter(Pattern pattern, boolean inverted) {
super();
- this.pattern = pattern;
+ this.matcher = pattern.matcher("");
this.inverted = inverted;
}
@@ -91,7 +95,7 @@ public class ByLabelFilter extends AbstractStreamFilter {
Event ev = source.nextEvent();
switch(ev){
case END_OF_STREAM:
- if (lblcol < 0) {
+ if(lblcol < 0) {
LOG.warning("By label filter was used, but never saw a label relation!");
}
return Event.END_OF_STREAM;
@@ -114,7 +118,8 @@ public class ByLabelFilter extends AbstractStreamFilter {
boolean good = false;
final LabelList ll = (LabelList) l;
for(int i = 0; i < ll.size(); i++) {
- if(pattern.matcher(ll.get(i)).matches()) {
+ matcher.reset(ll.get(i));
+ if(matcher.matches()) {
good = true;
break;
}
@@ -124,7 +129,8 @@ public class ByLabelFilter extends AbstractStreamFilter {
}
}
else {
- if(!pattern.matcher(l.toString()).matches()) {
+ matcher.reset(l.toString());
+ if(!matcher.matches()) {
continue;
}
}
@@ -190,7 +196,7 @@ public class ByLabelFilter extends AbstractStreamFilter {
}
@Override
- protected Object makeInstance() {
+ protected ByLabelFilter makeInstance() {
return new ByLabelFilter(pattern, inverted);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/RandomSamplingStreamFilter.java
index a7e44d4d..3e1a3d89 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/RandomSamplingStreamFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,9 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
import java.util.Random;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
@@ -39,6 +41,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.RandomSamplingStreamFilter", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.RandomSamplingStreamFilter" }) // first entry: class name before the move to the selection package; second entry kept for compatibility
public class RandomSamplingStreamFilter extends AbstractStreamFilter {
/**
* Probability
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ShuffleObjectsFilter.java
index 8afa8290..3fb77ce4 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ShuffleObjectsFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,8 +28,10 @@ import java.util.List;
import java.util.Random;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -40,6 +42,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.ShuffleObjectsFilter" })
public class ShuffleObjectsFilter implements ObjectFilter {
/**
* Class logger
@@ -73,18 +76,18 @@ public class ShuffleObjectsFilter implements ObjectFilter {
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Shuffling the data set");
}
final Random random = rnd.getSingleThreadedRandom();
final int size = objects.dataLength();
final int[] offsets = new int[size];
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
offsets[i] = i;
}
// Randomize the offset array
- for (int i = size; i > 1; i--) {
+ for(int i = size; i > 1; i--) {
final int j = random.nextInt(i);
// Swap the elements at positions j and i - 1:
final int temp = offsets[j];
@@ -93,11 +96,11 @@ public class ShuffleObjectsFilter implements ObjectFilter {
}
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
// Reorder column accordingly
List<?> in = objects.getColumn(j);
List<Object> data = new ArrayList<>(size);
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
data.add(in.get(offsets[i]));
}
bundle.appendColumn(objects.meta(j), data);
@@ -119,13 +122,13 @@ public class ShuffleObjectsFilter implements ObjectFilter {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
@Override
- protected Object makeInstance() {
+ protected ShuffleObjectsFilter makeInstance() {
return new ShuffleObjectsFilter(rnd);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/SortByLabelFilter.java
index d35d9cde..a6cef5fd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/SortByLabelFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,7 +28,9 @@ import java.util.List;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
@@ -39,6 +41,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
*
* @apiviz.uses de.lmu.ifi.dbs.elki.data.LabelList oneway - - «reads»
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.SortByLabelFilter", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.SortByLabelFilter" }) // first entry: class name before the move to the selection package; second entry kept for compatibility
public class SortByLabelFilter implements ObjectFilter {
/**
* Class logger
@@ -54,22 +57,22 @@ public class SortByLabelFilter implements ObjectFilter {
@Override
public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Shuffling the data set");
}
// Prepare a reposition array for cheap resorting
final int size = objects.dataLength();
final int[] offsets = new int[size];
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
offsets[i] = i;
}
// Sort by labels - identify a label column
final int lblcol;
{
int lblc = -1;
- for (int i = 0; i < objects.metaLength(); i++) {
- if (TypeUtil.GUESSED_LABEL.isAssignableFromType(objects.meta(i))) {
+ for(int i = 0; i < objects.metaLength(); i++) {
+ if(TypeUtil.GUESSED_LABEL.isAssignableFromType(objects.meta(i))) {
lblc = i;
break;
}
@@ -86,11 +89,11 @@ public class SortByLabelFilter implements ObjectFilter {
});
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
// Reorder column accordingly
List<?> in = objects.getColumn(j);
List<Object> data = new ArrayList<>(size);
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
data.add(in.get(offsets[i]));
}
bundle.appendColumn(objects.meta(j), data);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java
new file mode 100644
index 00000000..7ec0a3a3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Filters for selecting and sorting data to process.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.selection; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java
index 462db9eb..8c1ef6cb 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,13 +33,12 @@ import java.util.Map;
import de.lmu.ifi.dbs.elki.data.ClassLabel;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.NumberVector.Factory;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.filter.ClassLabelFilter;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.typeconversions.ClassLabelFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
@@ -60,7 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*
* @param <V> Vector type
*/
-public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberVector<?>> implements ObjectFilter {
+public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberVector> implements ObjectFilter {
/**
* The dimensionality to which the data should be reduced.
*/
@@ -114,7 +113,7 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV
List<V> vectorcolumn = (List<V>) column;
final VectorFieldTypeInformation<?> vtype = (VectorFieldTypeInformation<?>) type;
@SuppressWarnings("unchecked")
- NumberVector.Factory<V, ?> factory = (NumberVector.Factory<V, ?>) vtype.getFactory();
+ NumberVector.Factory<V> factory = (NumberVector.Factory<V> ) vtype.getFactory();
int dim = vtype.getDimensionality();
if(tdim > dim) {
@@ -155,7 +154,7 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV
* @param factory Vector factory
* @return output type restriction
*/
- protected SimpleTypeInformation<?> convertedType(SimpleTypeInformation<?> in, Factory<V, ?> factory) {
+ protected SimpleTypeInformation<?> convertedType(SimpleTypeInformation<?> in, NumberVector.Factory<V> factory) {
return new VectorFieldTypeInformation<>(factory, tdim);
}
@@ -206,7 +205,7 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV
*
* @param <V> Vector type
*/
- public abstract static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public abstract static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* The number of dimensions to keep.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java
index d646b489..32024581 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -30,8 +30,9 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
@@ -54,6 +55,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.composedOf SingularValueDecomposition
+ *
* @param <O> Data type
*/
@Alias({ "mds" })
@@ -66,7 +69,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
/**
* Distance function to use.
*/
- PrimitiveDoubleDistanceFunction<? super O> dist = null;
+ PrimitiveDistanceFunction<? super O> dist = null;
/**
* Target dimensionality
@@ -79,7 +82,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
* @param tdim Target dimensionality.
* @param dist Distance function to use.
*/
- public ClassicMultidimensionalScalingTransform(int tdim, PrimitiveDoubleDistanceFunction<? super O> dist) {
+ public ClassicMultidimensionalScalingTransform(int tdim, PrimitiveDistanceFunction<? super O> dist) {
super();
this.tdim = tdim;
this.dist = dist;
@@ -105,14 +108,14 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
// Get the replacement type information
@SuppressWarnings("unchecked")
final List<O> castColumn = (List<O>) column;
- NumberVector.Factory<? extends NumberVector<?>, ?> factory = null;
+ NumberVector.Factory<? extends NumberVector> factory = null;
{
if (type instanceof VectorFieldTypeInformation) {
final VectorFieldTypeInformation<?> ctype = (VectorFieldTypeInformation<?>) type;
// Note two-step cast, to make stricter compilers happy.
@SuppressWarnings("unchecked")
- final VectorFieldTypeInformation<? extends NumberVector<?>> vtype = (VectorFieldTypeInformation<? extends NumberVector<?>>) ctype;
- factory = (NumberVector.Factory<? extends NumberVector<?>, ?>) vtype.getFactory();
+ final VectorFieldTypeInformation<? extends NumberVector> vtype = (VectorFieldTypeInformation<? extends NumberVector>) ctype;
+ factory = FilterUtil.guessFactory(vtype);
} else {
factory = DoubleVector.FACTORY;
}
@@ -128,16 +131,12 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
final O ox = castColumn.get(x);
for (int y = x + 1; y < size; y++) {
final O oy = castColumn.get(y);
- double distance = Math.abs(dist.doubleDistance(ox, oy));
+ double distance = Math.abs(dist.distance(ox, oy));
imat[x][y] = distance;
- if (dprog != null) {
- dprog.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(dprog);
}
}
- if (dprog != null) {
- dprog.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(dprog);
}
// Adjust distance matrix:
if (dist instanceof SquaredEuclideanDistanceFunction) {
@@ -230,7 +229,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
*
* @apiviz.exclude
*/
- public static class Parameterizer<O extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<O extends NumberVector> extends AbstractParameterizer {
/**
* Desired dimensionality.
*/
@@ -249,7 +248,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
/**
* Distance function to use.
*/
- PrimitiveDoubleDistanceFunction<? super O> dist = null;
+ PrimitiveDistanceFunction<? super O> dist = null;
@Override
protected void makeOptions(Parameterization config) {
@@ -260,7 +259,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
tdim = dimP.intValue();
}
- ObjectParameter<PrimitiveDoubleDistanceFunction<? super O>> distP = new ObjectParameter<>(DISTANCE_ID, PrimitiveDoubleDistanceFunction.class, SquaredEuclideanDistanceFunction.class);
+ ObjectParameter<PrimitiveDistanceFunction<? super O>> distP = new ObjectParameter<>(DISTANCE_ID, PrimitiveDistanceFunction.class, SquaredEuclideanDistanceFunction.class);
if (config.grab(distP)) {
dist = distP.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java
index 3b4193ad..c6bd02a9 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -55,10 +55,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.composedOf PCARunner
+ * @apiviz.composedOf CovarianceMatrix
+ * @apiviz.composedOf EigenPairFilter
+ *
* @param <O> Vector type
*/
@Alias({ "whiten", "whitening", "pca" })
-public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector<?>> extends AbstractVectorConversionFilter<O, O> {
+public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector> extends AbstractVectorConversionFilter<O, O> {
/**
* Class logger.
*/
@@ -122,7 +126,7 @@ public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector<?>
@Override
protected void prepareComplete() {
mean = covmat.getMeanVector().getArrayRef();
- PCAResult pcares = (new PCARunner<O>(null)).processCovarMatrix(covmat.destroyToSampleMatrix());
+ PCAResult pcares = (new PCARunner(null)).processCovarMatrix(covmat.destroyToSampleMatrix());
SortedEigenPairs eps = pcares.getEigenPairs();
covmat = null;
@@ -190,7 +194,7 @@ public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector<?>
*
* @apiviz.exclude
*/
- public static class Parameterizer<O extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<O extends NumberVector> extends AbstractParameterizer {
/**
* To specify the eigenvectors to keep.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/HistogramJitterFilter.java
index 453d294e..8c34ce37 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/HistogramJitterFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.transform;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,9 @@ import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.ExponentialDistribution;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -55,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* @param <V> Vector type
*/
@Description("Add uniform Jitter to a dataset, while preserving the total vector sum.")
-public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<V, V> {
+public class HistogramJitterFilter<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> {
/**
* Jitter amount.
*/
@@ -145,8 +146,8 @@ public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVe
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- DoubleParameter jitterP = new DoubleParameter(JITTER_ID);
- jitterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ DoubleParameter jitterP = new DoubleParameter(JITTER_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
if(config.grab(jitterP)) {
jitter = jitterP.getValue().doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java
index 998c8931..9cb0b492 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,8 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.math.geodesy.EarthModel;
import de.lmu.ifi.dbs.elki.math.geodesy.SphericalVincentyEarthModel;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -39,13 +41,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.uses NumberVector
+ * @apiviz.composedOf EarthModel
+ *
* @param <V> Vector type.
*/
-public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStreamConversionFilter<V, V> {
+public class LatLngToECEFFilter<V extends NumberVector> extends AbstractStreamConversionFilter<V, V> {
/**
* Vector factory to use.
*/
- private NumberVector.Factory<V, ?> factory;
+ private NumberVector.Factory<V> factory;
/**
* Earth model to use.
@@ -69,14 +74,13 @@ public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return new VectorFieldTypeInformation<>(NumberVector.class, 2, 2);
+ return TypeUtil.NUMBER_VECTOR_FIELD_2D;
}
@Override
protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
- VectorFieldTypeInformation<V> vin = (VectorFieldTypeInformation<V>) in;
- factory = (NumberVector.Factory<V, ?>) vin.getFactory();
- return new VectorFieldTypeInformation<>(vin.getFactory(), 3, 3, in.getSerializer());
+ factory = FilterUtil.guessFactory(in);
+ return new VectorFieldTypeInformation<>(factory, 3, 3, in.getSerializer());
}
/**
@@ -88,7 +92,7 @@ public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
*
* @param <V> Vector type
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Earth model to use.
*/
@@ -98,7 +102,7 @@ public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<EarthModel> modelP = new ObjectParameter<>(EarthModel.MODEL_ID, EarthModel.class, SphericalVincentyEarthModel.class);
- if (config.grab(modelP)) {
+ if(config.grab(modelP)) {
model = modelP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java
index 76546d5c..537bfb20 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -59,7 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
*/
@Alias("lda")
@Reference(authors = "R. A. Fisher", title = "The use of multiple measurements in taxonomic problems", booktitle = "Annals of eugenics 7.2 (1936)", url = "http://dx.doi.org/10.1111/j.1469-1809.1936.tb02137.x")
-public class LinearDiscriminantAnalysisFilter<V extends NumberVector<?>> extends AbstractSupervisedProjectionVectorFilter<V> {
+public class LinearDiscriminantAnalysisFilter<V extends NumberVector> extends AbstractSupervisedProjectionVectorFilter<V> {
/**
* Class logger.
*/
@@ -156,7 +156,7 @@ public class LinearDiscriminantAnalysisFilter<V extends NumberVector<?>> extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractSupervisedProjectionVectorFilter.Parameterizer<V> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractSupervisedProjectionVectorFilter.Parameterizer<V> {
@Override
protected LinearDiscriminantAnalysisFilter<V> makeInstance() {
return new LinearDiscriminantAnalysisFilter<>(tdim);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java
index ea0d4ef2..d5fba25d 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,8 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.math.geodesy.EarthModel;
import de.lmu.ifi.dbs.elki.math.geodesy.SphericalVincentyEarthModel;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -39,13 +41,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.uses NumberVector
+ * @apiviz.composedOf EarthModel
+ *
* @param <V> Vector type.
*/
-public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStreamConversionFilter<V, V> {
+public class LngLatToECEFFilter<V extends NumberVector> extends AbstractStreamConversionFilter<V, V> {
/**
* Vector factory to use.
*/
- private NumberVector.Factory<V, ?> factory;
+ private NumberVector.Factory<V> factory;
/**
* Earth model to use.
@@ -69,14 +74,13 @@ public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return new VectorFieldTypeInformation<>(NumberVector.class, 2, 2);
+ return TypeUtil.NUMBER_VECTOR_FIELD_2D;
}
@Override
protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
- VectorFieldTypeInformation<V> vin = (VectorFieldTypeInformation<V>) in;
- factory = (NumberVector.Factory<V, ?>) vin.getFactory();
- return new VectorFieldTypeInformation<>(vin.getFactory(), 3, 3, in.getSerializer());
+ factory = FilterUtil.guessFactory(in);
+ return new VectorFieldTypeInformation<>(factory, 3, 3, in.getSerializer());
}
/**
@@ -88,7 +92,7 @@ public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
*
* @param <V> Vector type
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Earth model to use.
*/
@@ -98,7 +102,7 @@ public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<EarthModel> modelP = new ObjectParameter<>(EarthModel.MODEL_ID, EarthModel.class, SphericalVincentyEarthModel.class);
- if (config.grab(modelP)) {
+ if(config.grab(modelP)) {
model = modelP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java
index e6d0d15d..115d77dd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -51,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
*
* @param <V> Vector type
*/
-public class NumberVectorFeatureSelectionFilter<V extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<V, V> {
+public class NumberVectorFeatureSelectionFilter<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> {
/**
* Keeps the selection of the subspace to project onto.
*/
@@ -99,10 +99,8 @@ public class NumberVectorFeatureSelectionFilter<V extends NumberVector<?>> exten
}
/**
- * <p>
* Provides a BitSet with the bits set to true corresponding to the selected
* attributes in {@link Parameterizer#SELECTED_ATTRIBUTES_ID}.
- * </p>
*
* The index in the BitSet is shifted to the left by one, i.e., index 0 in the
* BitSet relates to the first attribute.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java
index 4086270c..dfca33ec 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorStreamConversionFilter;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.Util;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -51,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @param <V> vector type
*/
-public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<V, V> {
+public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> {
/**
* The selected attributes.
*/
@@ -155,10 +155,10 @@ public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector<?>>
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter kP = new IntParameter(NUMBER_SELECTED_ATTRIBUTES_ID, 1);
- kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ IntParameter kP = new IntParameter(NUMBER_SELECTED_ATTRIBUTES_ID, 1) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
- k = kP.getValue().intValue();
+ k = kP.intValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
if(config.grab(rndP)) {
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java
new file mode 100644
index 00000000..4e5fe9b3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java
@@ -0,0 +1,436 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.transform;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorConversionFilter;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.MeanVarianceMinMax;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeSetGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.EqualSizeGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
+
+/**
+ * A filter to perturb the values by adding micro-noise.
+ *
+ * The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a
+ * specified standard deviation or by a uniform distribution with a specified
+ * range. The standard deviation or the range can be scaled, attribute-wise, to
+ * a given percentage of the original standard deviation in the data
+ * distribution (assuming a Gaussian distribution there), or to a percentage of
+ * the extension in each attribute ({@code maximumValue - minimumValue}).
+ *
+ * This filter has a potentially wide use but has been implemented for the following publication:
+ *
+ * Reference:
+ * <p>
+ * A. Zimek, R. J. G. B. Campello, J. Sander:<br>
+ * Data Perturbation for Outlier Detection Ensembles.<br>
+ * In: Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014.
+ * </p>
+ *
+ * @author Arthur Zimek
+ */
+@Title("Data Perturbation for Outlier Detection Ensembles")
+@Description("A filter to perturb a datasset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).")
+@Reference(authors = "A. Zimek, R. J. G. B. Campello, J. Sander",//
+title = "Data Perturbation for Outlier Detection Ensembles", //
+booktitle = "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", //
+url = "http://dx.doi.org/10.1145/2618243.2618257")
+public class PerturbationFilter<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> {
+ /**
+ * Class logger
+ */
+ private static final Logging LOG = Logging.getLogger(PerturbationFilter.class);
+
+ /**
+ * Scaling reference options.
+ *
+ * The scaling reference determines the per-attribute factor the generated
+ * noise is multiplied with:
+ * <ul>
+ * <li>{@code UNITCUBE}: the configured percentage itself is used as factor
+ * for every attribute; no statistics are collected from the data.</li>
+ * <li>{@code STDDEV}: the sample standard deviation of the attribute times
+ * the percentage.</li>
+ * <li>{@code MINMAX}: the extension {@code maximum - minimum} of the
+ * attribute times the percentage, using preset minima/maxima when both are
+ * given and the observed minimum/maximum otherwise.</li>
+ * </ul>
+ * For {@code STDDEV} and {@code MINMAX}, a factor of zero or NaN is replaced
+ * by the percentage.
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.exclude
+ */
+ public static enum ScalingReference {
+ UNITCUBE, STDDEV, MINMAX
+ }
+
+ /**
+ * Nature of the noise distribution.
+ *
+ * <ul>
+ * <li>{@code GAUSSIAN}: noise is drawn with {@link Random#nextGaussian()}
+ * and multiplied with the per-attribute scaling value.</li>
+ * <li>{@code UNIFORM}: noise is drawn with {@link Random#nextDouble()},
+ * i.e. from {@code [0, 1)}, and multiplied with the per-attribute scaling
+ * value. NOTE(review): for a positive scaling value this offset is never
+ * negative - confirm that a one-sided shift is intended.</li>
+ * </ul>
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.exclude
+ */
+ public static enum NoiseDistribution {
+ GAUSSIAN, UNIFORM
+ }
+
+ /**
+ * Which reference to use for scaling the noise.
+ */
+ private ScalingReference scalingreference;
+
+ /**
+ * Nature of the noise distribution.
+ */
+ private NoiseDistribution noisedistribution;
+
+ /**
+ * Random object to generate the attribute-wise seeds for the noise. Created
+ * in the constructor, either from the given seed or unseeded.
+ */
+ private final Random RANDOM;
+
+ /**
+ * Relative amount of noise. This factor is multiplied with the scaling
+ * reference of each attribute (its sample standard deviation or its
+ * extension {@code max - min}), or used directly as the scaling value for
+ * the unit cube.
+ */
+ private double percentage;
+
+ /**
+ * Temporary storage used during initialization: one statistics collector per
+ * attribute, allocated when the first vector is seen and released again once
+ * the preparation is complete.
+ */
+ private MeanVarianceMinMax[] mvs = null;
+
+ /**
+ * Stores the scaling value in each dimension, i.e. the scaling reference
+ * already multiplied with the percentage. Empty until initialized.
+ */
+ private double[] scalingreferencevalues = new double[0];
+
+ /**
+ * The random objects to generate noise distributions independently for each
+ * attribute.
+ */
+ private Random[] randomPerAttribute = null;
+
+ /**
+ * Preset maximum in each dimension, as passed to the constructor. Only read
+ * when the scaling reference is {@link ScalingReference#MINMAX}.
+ */
+ private double[] maxima;
+
+ /**
+ * Preset minimum in each dimension, as passed to the constructor. Only read
+ * when the scaling reference is {@link ScalingReference#MINMAX}.
+ */
+ private double[] minima;
+
+ /**
+ * Stores the dimensionality from the preprocessing; 0 while it is not yet
+ * known.
+ */
+ private int dimensionality = 0;
+
+ /**
+  * Constructor.
+  *
+  * @param seed Seed value, may be {@code null} for a random seed.
+  * @param percentage Relative amount of jitter to add
+  * @param scalingreference Scaling reference
+  * @param minima Preset minimum values. May be {@code null} or empty when no
+  *        minima are preset; {@code null} is stored as an empty array.
+  * @param maxima Preset maximum values. May be {@code null} or empty when no
+  *        maxima are preset; {@code null} is stored as an empty array.
+  * @param noisedistribution Nature of the noise distribution.
+  */
+ public PerturbationFilter(Long seed, double percentage, ScalingReference scalingreference, double[] minima, double[] maxima, NoiseDistribution noisedistribution) {
+   super();
+   this.percentage = percentage;
+   this.scalingreference = scalingreference;
+   // The preparation methods test minima.length / maxima.length to detect
+   // "nothing preset". Map the documented null onto an empty array here, so
+   // that a null argument does not fail with a NullPointerException later.
+   this.minima = (minima != null) ? minima : new double[0];
+   this.maxima = (maxima != null) ? maxima : new double[0];
+   this.noisedistribution = noisedistribution;
+   this.RANDOM = (seed == null) ? new Random() : new Random(seed);
+ }
+
+ /**
+  * Initialize the scaling values from preset minima/maxima where possible.
+  *
+  * For the unit cube nothing is prepared here. For MINMAX with non-empty
+  * preset minima and maxima, the scaling values and the per-attribute random
+  * generators are set up immediately.
+  *
+  * @param in Input type information (not used by this implementation)
+  * @return {@code false} for the unit cube and for preset minima/maxima,
+  *         otherwise {@code true} exactly when no scaling values have been
+  *         computed yet. Presumably this tells the base class whether a
+  *         preparation pass over the data is needed - confirm against the
+  *         parent class.
+  */
+ @Override
+ protected boolean prepareStart(SimpleTypeInformation<V> in) {
+   if(scalingreference == ScalingReference.UNITCUBE) {
+     return false;
+   }
+   final boolean presetBounds = scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0;
+   if(!presetBounds) {
+     return scalingreferencevalues.length == 0;
+   }
+   final int dim = minima.length;
+   dimensionality = dim;
+   scalingreferencevalues = new double[dim];
+   randomPerAttribute = new Random[dim];
+   for(int d = 0; d < dim; d++) {
+     double scale = (maxima[d] - minima[d]) * percentage;
+     // Degenerate extension: fall back to the plain percentage.
+     if(scale == 0 || Double.isNaN(scale)) {
+       scale = percentage;
+     }
+     scalingreferencevalues[d] = scale;
+     randomPerAttribute[d] = new Random(RANDOM.nextLong());
+   }
+   return false;
+ }
+
+ /**
+  * Feed one vector into the per-attribute statistics.
+  *
+  * The statistics array is allocated lazily from the first vector seen,
+  * because the dimensionality is not known earlier.
+  *
+  * @param featureVector Vector to collect statistics from
+  */
+ @Override
+ protected void prepareProcessInstance(V featureVector) {
+   final int dim = featureVector.getDimensionality();
+   if(mvs == null) {
+     // First object: remember the dimensionality and allocate the collectors.
+     dimensionality = dim;
+     mvs = MeanVarianceMinMax.newArray(dim);
+   }
+   int d = 0;
+   while(d < dim) {
+     mvs[d].put(featureVector.doubleValue(d));
+     d++;
+   }
+ }
+
+ /**
+  * Derive the per-attribute scaling values from the collected statistics.
+  *
+  * For STDDEV the sample standard deviation is used; for MINMAX without
+  * preset minima and maxima the observed extension (maximum minus minimum).
+  * Either value is multiplied with the percentage; a result of zero or NaN is
+  * replaced by the percentage. One random generator per attribute is seeded
+  * from the master random object, and the statistics are released afterwards.
+  */
+ @Override
+ protected void prepareComplete() {
+   final StringBuilder msg = LOG.isDebuggingFine() ? new StringBuilder() : null;
+   scalingreferencevalues = new double[dimensionality];
+   randomPerAttribute = new Random[dimensionality];
+   final boolean useStddev = scalingreference == ScalingReference.STDDEV;
+   final boolean useExtension = !useStddev && scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0;
+   if(useStddev || useExtension) {
+     if(msg != null) {
+       msg.append(useStddev ? "Standard deviation per attribute: " : "extension per attribute: ");
+     }
+     for(int d = 0; d < dimensionality; d++) {
+       final double reference = useStddev ? mvs[d].getSampleStddev() : (mvs[d].getMax() - mvs[d].getMin());
+       double scale = reference * percentage;
+       if(scale == 0 || Double.isNaN(scale)) {
+         scale = percentage;
+       }
+       scalingreferencevalues[d] = scale;
+       randomPerAttribute[d] = new Random(RANDOM.nextLong());
+       if(msg != null) {
+         msg.append(" ").append(d).append(": ").append(scale / percentage);
+       }
+     }
+   }
+   // The statistics are no longer needed once the scaling values are known.
+   mvs = null;
+   if(msg != null) {
+     LOG.debugFine(msg.toString());
+   }
+ }
+
+ /**
+ * Get the type restriction for the input data of this filter.
+ *
+ * @return the {@code TypeUtil.NUMBER_VECTOR_FIELD} type constant
+ */
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
+ }
+
+ /**
+  * Add random noise to every attribute of a vector.
+  * <p>
+  * For the unit cube scaling reference no preparation pass has filled the
+  * scaling values, so they are set up here from the first vector seen: every
+  * attribute gets {@code percentage} as noise scale and its own random
+  * generator.
+  * </p>
+  *
+  * @param featureVector Vector to perturb
+  * @return Vector produced by the factory from the perturbed values
+  * @throws IllegalArgumentException if the dimensionality of the vector
+  *         differs from the number of scaling reference values
+  */
+ @Override
+ protected V filterSingleObject(V featureVector) {
+   final int dim = featureVector.getDimensionality();
+   if(scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) {
+     dimensionality = dim;
+     scalingreferencevalues = new double[dim];
+     randomPerAttribute = new Random[dim];
+     for(int d = 0; d < dim; d++) {
+       scalingreferencevalues[d] = percentage;
+       randomPerAttribute[d] = new Random(RANDOM.nextLong());
+     }
+   }
+   // The scaling values may stem from statistics, from the unit cube setup
+   // above or from user-given minima/maxima, so the message reports the
+   // actual lengths instead of blaming minima/maxima.
+   if(scalingreferencevalues.length != dim) {
+     throw new IllegalArgumentException("Feature vector dimensionality (" + dim + ") does not match the number of scaling reference values (" + scalingreferencevalues.length + ").");
+   }
+   double[] values = new double[dim];
+   for(int d = 0; d < dim; d++) {
+     if(this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) {
+       values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextGaussian() * scalingreferencevalues[d];
+     }
+     else if(this.noisedistribution.equals(NoiseDistribution.UNIFORM)) {
+       // nextDouble() is non-negative, so uniform noise only shifts upwards.
+       values[d] = featureVector.doubleValue(d) + randomPerAttribute[d].nextDouble() * scalingreferencevalues[d];
+     }
+   }
+   return factory.newNumberVector(values);
+ }
+
+ /**
+ * Pass the input type to {@code initializeOutputType} and report it
+ * unchanged as the output type.
+ *
+ * @param in Input type information
+ * @return the same type information object that was passed in
+ */
+ @Override
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
+ initializeOutputType(in);
+ return in;
+ }
+
+ /**
+ * Get the logger of this class.
+ *
+ * @return the static class logger {@code LOG}
+ */
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for the minimum values per dimension (optional).
+ * <p>
+ * Key: {@code -perturbationfilter.min}
+ * </p>
+ */
+ public static final OptionID MINIMA_ID = new OptionID("perturbationfilter.min", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for the maximum values per dimension (optional).
+ * <p>
+ * Key: {@code -perturbationfilter.max}
+ * </p>
+ */
+ public static final OptionID MAXIMA_ID = new OptionID("perturbationfilter.max", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Stores the maximum in each dimension. Stays an empty array if the option
+ * was not given.
+ */
+ private double[] maxima = new double[0];
+
+ /**
+ * Stores the minimum in each dimension. Stays an empty array if the option
+ * was not given.
+ */
+ private double[] minima = new double[0];
+
+ /**
+ * Optional parameter to specify a seed for the random noise generation. If
+ * unused, {@code null} is handed to the filter constructor, which then
+ * decides how to seed its generator.
+ * <p>
+ * Key: {@code -perturbationfilter.seed}
+ * </p>
+ */
+ public static final OptionID SEED_ID = new OptionID("perturbationfilter.seed", "Seed for random noise generation.");
+
+ /**
+ * Seed for the random noise generation; {@code null} if the seed option
+ * was not given.
+ */
+ protected Long seed = null;
+
+ /**
+ * Optional parameter to specify a percentage of the standard deviation of
+ * the random Gaussian noise generation, given the standard deviation of the
+ * corresponding attribute in the original data distribution (assuming a
+ * Gaussian there).
+ *
+ * <p>
+ * Key: {@code -perturbationfilter.percentage}
+ * </p>
+ * <p>
+ * Default: <code>0.01</code>
+ * </p>
+ * <p>
+ * Constraint: 0 &lt; percentage &le; 1
+ * </p>
+ */
+ public static final OptionID PERCENTAGE_ID = new OptionID("perturbationfilter.percentage", "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there).");
+
+ /**
+ * Parameter for selecting scaling reference.
+ * <p>
+ * Key: {@code -perturbationfilter.scalingreference}
+ * </p>
+ * <p>
+ * Default: <code>ScalingReference.UNITCUBE</code>
+ * </p>
+ */
+ // NOTE(review): the help text below misspells "respectively"; it is runtime
+ // text and therefore left untouched here.
+ public static final OptionID SCALINGREFERENCE_ID = new OptionID("perturbationfilter.scalingreference", "The reference for scaling the Gaussian noise. Default is " + ScalingReference.UNITCUBE + ", parameter " + PERCENTAGE_ID.getName() + " will then directly define the standard deviation of all noise Gaussians. For options " + ScalingReference.STDDEV + " and " + ScalingReference.MINMAX + ", the percentage of the attributewise standard deviation or extension, repectively, will define the attributewise standard deviation of the noise Gaussians.");
+
+ /**
+ * Parameter for selecting the noise distribution.
+ *
+ * <p>
+ * Key: {@code -perturbationfilter.noisedistribution}
+ * </p>
+ * <p>
+ * Default: <code>NoiseDistribution.UNIFORM</code>
+ * </p>
+ *
+ */
+ public static final OptionID NOISEDISTRIBUTION_ID = new OptionID("perturbationfilter.noisedistribution", "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM);
+
+ /**
+ * Value of the percentage option: the factor applied to the scaling
+ * reference of an attribute (its standard deviation or extension), or used
+ * directly as noise scale for the unit cube reference.
+ */
+ protected double percentage;
+
+ /**
+ * The option which reference to use for scaling the noise.
+ */
+ protected ScalingReference scalingreference;
+
+ /**
+ * The option which nature of noise distribution to choose.
+ */
+ protected NoiseDistribution noisedistribution;
+
+ /**
+ * Read all options of the perturbation filter from the parameterization.
+ *
+ * @param config Parameterization to read the options from
+ */
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ // Scaling reference, defaulting to the unit cube.
+ EnumParameter<ScalingReference> scalingReferenceP = new EnumParameter<>(SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE);
+ if(config.grab(scalingReferenceP)) {
+ scalingreference = scalingReferenceP.getValue();
+ }
+ // Noise distribution, defaulting to uniform noise.
+ EnumParameter<NoiseDistribution> noisedistributionP = new EnumParameter<>(NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM);
+ if(config.grab(noisedistributionP)) {
+ noisedistribution = noisedistributionP.getValue();
+ }
+ // Percentage, default 0.01, constrained to 0 < percentage <= 1.
+ DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, .01);
+ percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(percentageP)) {
+ percentage = percentageP.getValue();
+ }
+ // Optional seed; the field stays null when the option is absent.
+ LongParameter seedP = new LongParameter(SEED_ID);
+ seedP.setOptional(true);
+ if(config.grab(seedP)) {
+ seed = seedP.getValue();
+ }
+ // Optional explicit minima; the field stays empty when absent.
+ DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID);
+ minimaP.setOptional(true);
+ if(config.grab(minimaP)) {
+ minima = ArrayLikeUtil.toPrimitiveDoubleArray(minimaP.getValue());
+ }
+ // Optional explicit maxima; the field stays empty when absent.
+ DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID);
+ maximaP.setOptional(true);
+ if(config.grab(maximaP)) {
+ maxima = ArrayLikeUtil.toPrimitiveDoubleArray(maximaP.getValue());
+ }
+
+ // Going by the constraint names: minima and maxima have to be given
+ // together, and with the same number of values.
+ config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP));
+ config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP));
+ }
+
+ /**
+ * Create the filter from the values read in {@code makeOptions}.
+ *
+ * @return new perturbation filter instance
+ */
+ @Override
+ protected PerturbationFilter<V> makeInstance() {
+ return new PerturbationFilter<>(seed, percentage, scalingreference, minima, maxima, noisedistribution);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java
index af3f4c6e..e58ea3b0 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -37,6 +37,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.composedOf Projection
+ *
* @param <I> Input type
* @param <O> Output type
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java
index 7082f103..3a81b989 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFilter.java
index 020dcb31..582eba65 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,6 +31,8 @@ import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -46,6 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @apiviz.uses LabelList oneway - - «reads»
* @apiviz.has ClassLabel
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ClassLabelFilter" })
public class ClassLabelFilter implements ObjectFilter {
/**
* The index of the label to be used as class label, null if no class label is
@@ -180,7 +183,7 @@ public class ClassLabelFilter implements ObjectFilter {
}
@Override
- protected Object makeInstance() {
+ protected ClassLabelFilter makeInstance() {
return new ClassLabelFilter(classLabelIndex, classLabelFactory);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFromPatternFilter.java
index 517eb301..3ced4e2f 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFromPatternFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,7 +23,8 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.BitSet;
+import gnu.trove.list.array.TIntArrayList;
+
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
@@ -31,6 +32,8 @@ import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -42,6 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ClassLabelFromPatternFilter" })
public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
/**
* Current meta data
@@ -51,7 +55,7 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
/**
* Bitset of label columns
*/
- BitSet labelcols = new BitSet();
+ TIntArrayList labelcols = new TIntArrayList();
/**
* Label to return for positive matches.
@@ -98,16 +102,16 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
@Override
public BundleMeta getMeta() {
- if (meta == null) {
+ if(meta == null) {
// Rebuild metadata.
BundleMeta origmeta = source.getMeta();
meta = new BundleMeta(origmeta.size() + 1);
meta.add(TypeUtil.SIMPLE_CLASSLABEL);
labelcols.clear();
- for (int i = 0; i < origmeta.size(); i++) {
+ for(int i = 0; i < origmeta.size(); i++) {
final SimpleTypeInformation<?> orig = origmeta.get(i);
- if (TypeUtil.GUESSED_LABEL.isAssignableFromType(orig)) {
- labelcols.set(i);
+ if(TypeUtil.GUESSED_LABEL.isAssignableFromType(orig)) {
+ labelcols.add(i);
}
meta.add(orig);
}
@@ -117,27 +121,27 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
@Override
public Object data(int rnum) {
- if (rnum > 0) {
+ if(rnum > 0) {
return source.data(rnum - 1);
}
- if (meta == null) {
+ if(meta == null) {
getMeta(); // Trigger build
}
- for (int i = labelcols.nextSetBit(0); i >= 0; i = labelcols.nextSetBit(i + 1)) {
- Object o = source.data(i);
- if (o == null) {
+ for(int i = 0; i < labelcols.size(); i++) {
+ Object o = source.data(labelcols.get(i));
+ if(o == null) {
continue;
}
- if (o instanceof LabelList) {
+ if(o instanceof LabelList) {
final LabelList ll = (LabelList) o;
for(int j = 0; j < ll.size(); j++) {
- if (pattern.matcher(ll.get(j)).find()) {
+ if(pattern.matcher(ll.get(j)).find()) {
return positive;
}
}
continue;
}
- if (pattern.matcher(o.toString()).find()) {
+ if(pattern.matcher(o.toString()).find()) {
return positive;
}
}
@@ -147,7 +151,7 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
@Override
public Event nextEvent() {
final Event ev = source.nextEvent();
- if (Event.META_CHANGED.equals(ev)) {
+ if(Event.META_CHANGED.equals(ev)) {
meta = null;
}
return ev;
@@ -191,17 +195,17 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
super.makeOptions(config);
PatternParameter patternP = new PatternParameter(PATTERN_ID);
- if (config.grab(patternP)) {
+ if(config.grab(patternP)) {
pattern = patternP.getValue();
}
StringParameter positiveP = new StringParameter(POSITIVE_ID, "positive");
- if (config.grab(positiveP)) {
+ if(config.grab(positiveP)) {
positive = positiveP.getValue();
}
StringParameter negativeP = new StringParameter(NEGATIVE_ID, "negative");
- if (config.grab(negativeP)) {
+ if(config.grab(negativeP)) {
negative = negativeP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ExternalIDFilter.java
index 17538dc9..3947a7cd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ExternalIDFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,6 +31,8 @@ import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -44,7 +46,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
* @apiviz.uses LabelList oneway - - «reads»
* @apiviz.has ExternalID oneway - - «produces»
*/
-// TODO: use a non-string class for external ids?
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ExternalIDFilter" })
public class ExternalIDFilter implements ObjectFilter {
/**
* The index of the label to be used as external Id.
@@ -143,7 +145,7 @@ public class ExternalIDFilter implements ObjectFilter {
}
@Override
- protected Object makeInstance() {
+ protected ExternalIDFilter makeInstance() {
return new ExternalIDFilter(externalIdIndex);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java
new file mode 100644
index 00000000..97a5d59d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java
@@ -0,0 +1,124 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.FeatureVector;
+import de.lmu.ifi.dbs.elki.data.type.MultivariateSeriesTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * Class to "fold" a flat number vector into a multivariate time series.
+ * <p>
+ * The vectors themselves are passed through unchanged; only the type
+ * information is replaced by a {@link MultivariateSeriesTypeInformation}
+ * that carries the number of variates.
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class MultivariateTimeSeriesFilter<V extends FeatureVector<?>> extends AbstractStreamConversionFilter<V, V> {
+  /**
+   * Logger of this class.
+   */
+  private static final Logging LOG = Logging.getLogger(MultivariateTimeSeriesFilter.class);
+
+  /**
+   * How many variates each series consists of.
+   */
+  int variates;
+
+  /**
+   * Constructor.
+   *
+   * @param variates Number of variates.
+   */
+  public MultivariateTimeSeriesFilter(int variates) {
+    super();
+    this.variates = variates;
+  }
+
+  /**
+   * Check that the vector can be split evenly into the configured number of
+   * variates; the vector itself is returned as is.
+   *
+   * @param obj Vector to check
+   * @return the unmodified input vector
+   * @throws AbortException if the vector length is not a multiple of the
+   *         number of variates
+   */
+  @Override
+  protected V filterSingleObject(V obj) {
+    final int length = obj.getDimensionality();
+    if(length % variates == 0) {
+      return obj;
+    }
+    throw new AbortException("Vector length " + length + " not divisible by the number of variates " + variates);
+  }
+
+  /**
+   * Get the type restriction for the input data of this filter.
+   *
+   * @return the {@code TypeUtil.FEATURE_VECTORS} type constant
+   */
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.FEATURE_VECTORS;
+  }
+
+  /**
+   * Build the series type information from the vector type information of
+   * the input, adding the number of variates.
+   *
+   * @param in Input type information, cast to a vector type information
+   * @return multivariate series type information
+   */
+  @Override
+  protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
+    final VectorTypeInformation<V> vectorType = (VectorTypeInformation<V>) in;
+    return new MultivariateSeriesTypeInformation<>(vectorType.getFactory(), in.getSerializer(), vectorType.mindim(), vectorType.maxdim(), variates);
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   *
+   * @param <V> Vector type
+   */
+  public static class Parameterizer<V extends FeatureVector<?>> extends AbstractParameterizer {
+    /**
+     * Parameter for specifying the number of variates of this series.
+     */
+    public static final OptionID VARIATES_ID = new OptionID("series.variates", "Number of variates this time series has.");
+
+    /**
+     * Number of variates read from the option.
+     */
+    int variates;
+
+    /**
+     * Read the number of variates (at least one) from the parameterization.
+     *
+     * @param config Parameterization to read the option from
+     */
+    @Override
+    protected void makeOptions(Parameterization config) {
+      IntParameter variatesP = new IntParameter(VARIATES_ID)//
+      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+      if(!config.grab(variatesP)) {
+        return;
+      }
+      variates = variatesP.intValue();
+      // A single variate is accepted, but the filter is pointless then.
+      if(variates == 1) {
+        LOG.warning("For univariate series, you should not need to use this filter.");
+      }
+    }
+
+    /**
+     * Create the filter from the value read in {@code makeOptions}.
+     *
+     * @return new filter instance
+     */
+    @Override
+    protected MultivariateTimeSeriesFilter<V> makeInstance() {
+      return new MultivariateTimeSeriesFilter<>(variates);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SparseVectorFieldFilter.java
index 97960907..2b84f0a6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SparseVectorFieldFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,10 @@ import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractConversionFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
* Class that turns sparse float vectors into a proper vector field, by setting
@@ -37,7 +40,8 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
*
* @param <V> Vector type
*/
-public class SparseVectorFieldFilter<V extends SparseNumberVector<?>> extends AbstractConversionFilter<V, V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.SparseVectorFieldFilter" })
+public class SparseVectorFieldFilter<V extends SparseNumberVector> extends AbstractConversionFilter<V, V> {
/**
* Class logger.
*/
@@ -79,7 +83,7 @@ public class SparseVectorFieldFilter<V extends SparseNumberVector<?>> extends Ab
@Override
protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
- SparseNumberVector.Factory<V, ?> factory = (SparseNumberVector.Factory<V, ?>) FilterUtil.guessFactory(in);
+ SparseNumberVector.Factory<V> factory = (SparseNumberVector.Factory<V>) FilterUtil.guessFactory(in);
return new VectorFieldTypeInformation<>(factory, maxdim);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SplitNumberVectorFilter.java
index 6ac046ec..81f640df 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SplitNumberVectorFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,11 +27,13 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.NumberVector.Factory;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -48,7 +50,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
*
* @param <V> Vector type
*/
-public class SplitNumberVectorFilter<V extends NumberVector<?>> implements ObjectFilter {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.SplitNumberVectorFilter" })
+public class SplitNumberVectorFilter<V extends NumberVector> implements ObjectFilter {
/**
* Selected dimensions.
*/
@@ -83,7 +86,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec
// Should be a vector type after above test.
@SuppressWarnings("unchecked")
final VectorFieldTypeInformation<V> vtype = VectorFieldTypeInformation.class.cast(type);
- Factory<V, ?> factory = FilterUtil.guessFactory(vtype);
+ NumberVector.Factory<V> factory = FilterUtil.guessFactory(vtype);
// Get the replacement type informations
VectorFieldTypeInformation<V> type1 = new VectorFieldTypeInformation<>(factory, dims.length);
@@ -144,7 +147,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec
for(int i = 1; i < dims.length; i++) {
m = Math.max(dims[i], m);
}
- return new VectorFieldTypeInformation<>(NumberVector.class, m, Integer.MAX_VALUE);
+ return VectorFieldTypeInformation.typeRequest(NumberVector.class, m, Integer.MAX_VALUE);
}
/**
@@ -154,7 +157,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* The parameter listing the split dimensions.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java
new file mode 100644
index 00000000..d582c8d2
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Filters to perform data type conversions.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions; \ No newline at end of file