summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/filter
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/filter')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java11
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractFeatureSelectionFilter.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractRandomFeatureSelectionFilter.java19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java117
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java50
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/FilterByLabelFilter.java)111
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorProjectionFilter.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorRandomProjectionFilter.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java7
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java57
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/FilterNoMissingValuesFilter.java)10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorProjectionFilter.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorRandomProjectionFilter.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java75
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java43
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractNormalization.java)5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseErfNormalization.java)10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseMinMaxNormalization.java)20
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseVarianceNormalization.java)32
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/InverseDocumentFrequencyNormalization.java)27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java115
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/NonNumericFeaturesException.java)4
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/Normalization.java)5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java113
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/TFIDFNormalization.java)12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java26
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java2
33 files changed, 779 insertions, 170 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
index 990458bf..34fb6bad 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -65,10 +65,7 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
// Get the replacement type information
@SuppressWarnings("unchecked")
final SimpleTypeInformation<I> castType = (SimpleTypeInformation<I>) type;
- @SuppressWarnings("unchecked")
- final List<O> castColumn = (List<O>) column;
- bundle.appendColumn(convertedType(castType), castColumn);
-
+
// When necessary, perform an initialization scan
if(prepareStart(castType)) {
for(Object o : column) {
@@ -79,6 +76,10 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
prepareComplete();
}
+ @SuppressWarnings("unchecked")
+ final List<O> castColumn = (List<O>) column;
+ bundle.appendColumn(convertedType(castType), castColumn);
+
// Normalization scan
for(int i = 0; i < objects.dataLength(); i++) {
@SuppressWarnings("unchecked")
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractFeatureSelectionFilter.java
index d53dfb94..009296b1 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractFeatureSelectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractFeatureSelectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -45,7 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
* @param <V> the type of FeatureVector contained in both the original and
* projected data.
*/
-public abstract class AbstractFeatureSelectionFilter<V extends FeatureVector<?, ?>> extends AbstractConversionFilter<V, V> {
+public abstract class AbstractFeatureSelectionFilter<V extends FeatureVector<?, ?>> extends AbstractStreamConversionFilter<V, V> {
/**
* <p>
* Selected attributes parameter.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractRandomFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractRandomFeatureSelectionFilter.java
index 975d5bd5..b52c7887 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractRandomFeatureSelectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractRandomFeatureSelectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -45,14 +45,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*
* The cardinality of the subset of attributes is specified as a parameter.
*
- *
* @author Arthur Zimek
* @author Erich Schubert
*
* @param <V> the type of FeatureVector contained in both the original data of
* the base parser and the projected data of this ProjectionParser
*/
-public abstract class AbstractRandomFeatureSelectionFilter<V extends FeatureVector<?, ?>> extends AbstractConversionFilter<V, V> {
+public abstract class AbstractRandomFeatureSelectionFilter<V extends FeatureVector<?, ?>> extends AbstractStreamConversionFilter<V, V> {
/**
* The selected attributes
*/
@@ -94,13 +93,17 @@ public abstract class AbstractRandomFeatureSelectionFilter<V extends FeatureVect
super();
this.k = dim;
}
-
- @Override
- protected boolean prepareStart(SimpleTypeInformation<V> in) {
+
+ /**
+ * Initialize random attributes.
+ *
+ * Invoke this from {@link #convertedType}!
+ *
+ * @param in Type information.
+ */
+ void initializeRandomAttributes(SimpleTypeInformation<V> in) {
int d = ((VectorFieldTypeInformation<V>) in).dimensionality();
selectedAttributes = Util.randomBitSet(k, d, random);
- // We don't need the full loop, so return false.
- return false;
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
new file mode 100644
index 00000000..1c8acb72
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
@@ -0,0 +1,117 @@
+package de.lmu.ifi.dbs.elki.datasource.filter;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+
+/**
+ * Abstract base class for simple conversion filters such as normalizations and
+ * projections.
+ *
+ * @author Erich Schubert
+ *
+ * @param <I> Input object type
+ * @param <O> Output object type
+ */
+public abstract class AbstractStreamConversionFilter<I, O> extends AbstractStreamFilter {
+ /**
+ * The filtered meta
+ */
+ BundleMeta meta;
+
+ /**
+ * The column to filter
+ */
+ int column = -1;
+
+ @Override
+ public BundleMeta getMeta() {
+ return meta;
+ }
+
+ @Override
+ public Object data(int rnum) {
+ if(rnum != column) {
+ return source.data(rnum);
+ }
+ // Convert:
+ @SuppressWarnings("unchecked")
+ final I obj = (I) source.data(rnum);
+ return filterSingleObject(obj);
+ }
+
+ @Override
+ public Event nextEvent() {
+ Event ev = source.nextEvent();
+ if(ev == Event.META_CHANGED) {
+ if(meta == null) {
+ meta = new BundleMeta();
+ }
+ BundleMeta origmeta = source.getMeta();
+ for(int i = meta.size(); i < origmeta.size(); i++) {
+ if(column < 0) {
+ @SuppressWarnings("unchecked")
+ SimpleTypeInformation<Object> type = (SimpleTypeInformation<Object>) origmeta.get(i);
+ // Test whether this type matches
+ if(getInputTypeRestriction().isAssignableFromType(type)) {
+ @SuppressWarnings("unchecked")
+ final SimpleTypeInformation<I> castType = (SimpleTypeInformation<I>) type;
+ meta.add(convertedType(castType));
+ column = i;
+ continue;
+ }
+ }
+ meta.add(origmeta.get(i));
+ }
+ }
+ return ev;
+ }
+
+ /**
+ * Convert a single instance.
+ *
+ * This is invoked by {@link #data} for each object of the filtered column;
+ * objects of all other columns are passed through unchanged.
+ *
+ * @param obj Database object to convert
+ * @return Converted database object
+ */
+ abstract protected O filterSingleObject(I obj);
+
+ /**
+ * Get the input type restriction used for negotiating the data query.
+ *
+ * @return Type restriction
+ */
+ abstract protected SimpleTypeInformation<? super I> getInputTypeRestriction();
+
+ /**
+ * Get the output type from the input type after conversion.
+ *
+ * @param in input type restriction
+ * @return output type restriction
+ */
+ abstract protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<I> in);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
new file mode 100644
index 00000000..368be1a2
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
@@ -0,0 +1,50 @@
+package de.lmu.ifi.dbs.elki.datasource.filter;
+
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/**
+ * Abstract base class for streaming filters.
+ *
+ * @author Erich Schubert
+ */
+public abstract class AbstractStreamFilter implements StreamFilter {
+ /**
+ * Data source
+ */
+ protected BundleStreamSource source = null;
+
+ @Override
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ init(new StreamFromBundle(objects));
+ return MultipleObjectsBundle.fromStream(this);
+ }
+
+ @Override
+ public void init(BundleStreamSource source) {
+ this.source = source;
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java
index b950080d..ebf01cfd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterByLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,12 +23,11 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.ArrayList;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -43,79 +42,103 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
*
* @apiviz.uses LabelList oneway - - «reads»
*/
-public class FilterByLabelFilter implements ObjectFilter {
+public class ByLabelFilter extends AbstractStreamFilter {
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(FilterByLabelFilter.class);
+ private static final Logging logger = Logging.getLogger(ByLabelFilter.class);
/**
* The filter pattern
*/
private final Pattern pattern;
-
+
/**
* Inversion flag
*/
private final boolean inverted;
/**
+ * Label column
+ */
+ private int lblcol = -1;
+
+ /**
* Constructor.
*
* @param pattern Filter pattern
* @param inverted Inversion flag
*/
- public FilterByLabelFilter(Pattern pattern, boolean inverted) {
+ public ByLabelFilter(Pattern pattern, boolean inverted) {
super();
this.pattern = pattern;
this.inverted = inverted;
}
@Override
- public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if(logger.isDebugging()) {
- logger.debug("Filtering the data set");
- }
+ public BundleMeta getMeta() {
+ return source.getMeta();
+ }
- // Identify a label column
- final int lblcol;
- {
- int lblc = -1;
- for(int i = 0; i < objects.metaLength(); i++) {
- if(TypeUtil.GUESSED_LABEL.isAssignableFromType(objects.meta(i))) {
- lblc = i;
- break;
- }
- }
- lblcol = lblc; // make static
- }
+ @Override
+ public Object data(int rnum) {
+ return source.data(rnum);
+ }
- MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for(int j = 0; j < objects.metaLength(); j++) {
- bundle.appendColumn(objects.meta(j), new ArrayList<Object>());
- }
- for(int i = 0; i < objects.dataLength(); i++) {
- Object l = objects.data(i, lblcol);
- if(l instanceof LabelList) {
- boolean good = false;
- for(String label : (LabelList) l) {
- if(pattern.matcher(label).matches()) {
- good = true;
- break;
+ @Override
+ public Event nextEvent() {
+ while(true) {
+ Event ev = source.nextEvent();
+ switch(ev){
+ case END_OF_STREAM:
+ if (lblcol < 0) {
+ logger.warning("By label filter was used, but never saw a label relation!");
+ }
+ return Event.END_OF_STREAM;
+ case META_CHANGED:
+ // Search for the first label column (only while none has been found yet)
+ if(lblcol < 0) {
+ BundleMeta meta = source.getMeta();
+ for(int i = 0; i < meta.size(); i++) {
+ if(TypeUtil.GUESSED_LABEL.isAssignableFromType(meta.get(i))) {
+ lblcol = i;
+ break;
+ }
}
}
- if(good == inverted) {
- continue;
+ return Event.META_CHANGED;
+ case NEXT_OBJECT:
+ if(lblcol >= 0) { // column 0 is a valid label column; -1 means none found yet
+ Object l = source.data(lblcol);
+ if(l instanceof LabelList) {
+ boolean good = false;
+ for(String label : (LabelList) l) {
+ if(pattern.matcher(label).matches()) {
+ good = true;
+ break;
+ }
+ }
+ if(good == inverted) {
+ continue;
+ }
+ }
+ else {
+ if(!pattern.matcher(l.toString()).matches()) {
+ continue;
+ }
+ }
}
- }
- else {
- if(!pattern.matcher(l.toString()).matches()) {
- continue;
+ else {
+ // No label column known yet: nothing can match, so keep objects only when inverted.
+ if(!inverted) {
+ continue;
+ }
}
+ return Event.NEXT_OBJECT;
+ default:
+ logger.warning("Unknown event: " + ev);
}
- bundle.appendSimple(objects.getRow(i));
}
- return bundle;
}
/**
@@ -167,7 +190,7 @@ public class FilterByLabelFilter implements ObjectFilter {
@Override
protected Object makeInstance() {
- return new FilterByLabelFilter(pattern, inverted);
+ return new ByLabelFilter(pattern, inverted);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java
index 1c9a2274..95596773 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorProjectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorProjectionFilter.java
index 21b00b0d..4793b041 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorProjectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorProjectionFilter.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -61,7 +61,7 @@ public class DoubleVectorProjectionFilter extends AbstractFeatureSelectionFilter
@Override
protected SimpleTypeInformation<? super DoubleVector> convertedType(SimpleTypeInformation<DoubleVector> in) {
- return new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, getDimensionality(), new DoubleVector(new double[getDimensionality()]));
+ return new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, DoubleVector.STATIC, getDimensionality(), DoubleVector.STATIC);
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorRandomProjectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorRandomProjectionFilter.java
index 802b00d6..5aa31967 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorRandomProjectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/DoubleVectorRandomProjectionFilter.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -61,7 +61,8 @@ public class DoubleVectorRandomProjectionFilter extends AbstractRandomFeatureSel
@Override
protected SimpleTypeInformation<? super DoubleVector> convertedType(SimpleTypeInformation<DoubleVector> in) {
- return new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, k, new DoubleVector(new double[k]));
+ initializeRandomAttributes(in);
+ return new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, DoubleVector.STATIC, k, DoubleVector.STATIC);
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java
index c53c2e4d..f48810f5 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -48,7 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
// TODO: use a non-string class for external ids?
public class ExternalIDFilter implements ObjectFilter {
/**
- * Optional parameter that specifies the index of the label to be used as
+ * Parameter that specifies the index of the label to be used as
* external Id, must be an integer equal to or greater than 0.
* <p>
* Key: {@code -dbc.externalIdIndex}
@@ -57,8 +57,7 @@ public class ExternalIDFilter implements ObjectFilter {
public static final OptionID EXTERNALID_INDEX_ID = OptionID.getOrCreateOptionID("dbc.externalIdIndex", "The index of the label to be used as external Id.");
/**
- * The index of the label to be used as external Id, null if no external id
- * index is specified.
+ * The index of the label to be used as external Id.
*/
private final int externalIdIndex;
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
index b49494e4..c34ecbe7 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,13 +23,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.ArrayList;
-import java.util.List;
-
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -43,7 +40,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*
* @apiviz.has DBID oneway - - «produces»
*/
-public class FixedDBIDsFilter implements ObjectFilter {
+public class FixedDBIDsFilter extends AbstractStreamFilter {
/**
* Optional parameter to specify the first object ID to use.
* <p>
@@ -53,9 +50,14 @@ public class FixedDBIDsFilter implements ObjectFilter {
public static final OptionID IDSTART_ID = OptionID.getOrCreateOptionID("dbc.startid", "Object ID to start counting with");
/**
- * The first ID to assign
+ * The filtered meta
+ */
+ BundleMeta meta;
+
+ /**
+ * The next ID to assign
*/
- final int startid;
+ int curid = 0;
/**
* Constructor.
@@ -64,22 +66,39 @@ public class FixedDBIDsFilter implements ObjectFilter {
*/
public FixedDBIDsFilter(int startid) {
super();
- this.startid = startid;
+ this.curid = startid;
+ }
+
+ @Override
+ public BundleMeta getMeta() {
+ return meta;
}
@Override
- public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- List<DBID> ids = new ArrayList<DBID>(objects.dataLength());
- for(int i = 0; i < objects.dataLength(); i++) {
- ids.add(DBIDUtil.importInteger(startid + i));
+ public Event nextEvent() {
+ Event ev = source.nextEvent();
+ if(ev == Event.META_CHANGED) {
+ if(meta == null) {
+ meta = new BundleMeta();
+ meta.add(TypeUtil.DBID);
+ }
+ BundleMeta origmeta = source.getMeta();
+ // Note -1 for the injected DBID column
+ for(int i = meta.size() - 1; i < origmeta.size(); i++) {
+ meta.add(origmeta.get(i));
+ }
}
- bundle.appendColumn(TypeUtil.DBID, ids);
- // copy other columns
- for(int j = 0; j < objects.metaLength(); j++) {
- bundle.appendColumn(objects.meta(j), objects.getColumn(j));
+ return ev;
+ }
+
+ @Override
+ public Object data(int rnum) {
+ if(rnum == 0) {
+ DBID ret = DBIDUtil.importInteger(curid);
+ curid++;
+ return ret;
}
- return bundle;
+ return source.data(rnum - 1);
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterNoMissingValuesFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java
index ceb671df..da5f066f 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterNoMissingValuesFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -34,16 +34,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*/
-public class FilterNoMissingValuesFilter implements ObjectFilter {
+public class NoMissingValuesFilter implements ObjectFilter {
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(FilterNoMissingValuesFilter.class);
+ private static final Logging logger = Logging.getLogger(NoMissingValuesFilter.class);
/**
* Constructor.
*/
- public FilterNoMissingValuesFilter() {
+ public NoMissingValuesFilter() {
super();
}
@@ -82,7 +82,7 @@ public class FilterNoMissingValuesFilter implements ObjectFilter {
public static class Parameterizer extends AbstractParameterizer {
@Override
protected Object makeInstance() {
- return new FilterNoMissingValuesFilter();
+ return new NoMissingValuesFilter();
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
index 9275c3c2..264f58fd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
/**
@@ -34,7 +35,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
*
* @author Erich Schubert
*/
-public class NoOpFilter implements ObjectFilter {
+public class NoOpFilter extends AbstractStreamFilter {
/**
* Constructor.
*/
@@ -46,4 +47,19 @@ public class NoOpFilter implements ObjectFilter {
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
return objects;
}
-}
+
+ @Override
+ public BundleMeta getMeta() {
+ return source.getMeta();
+ }
+
+ @Override
+ public Object data(int rnum) {
+ return source.data(rnum);
+ }
+
+ @Override
+ public Event nextEvent() {
+ return source.nextEvent();
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
index 1d5c2ba9..b3670e9b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,6 +24,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
*/
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.utilities.InspectionUtilFrequentlyScanned;
/**
* Object filters as part of the input step.
@@ -34,7 +35,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
*
* @apiviz.uses MultipleObjectsBundle oneway - - «filters»
*/
-public interface ObjectFilter {
+public interface ObjectFilter extends InspectionUtilFrequentlyScanned {
/**
* Filter a set of object packages.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java
index 6618721f..a8bf2cec 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -50,10 +50,10 @@ public class ShuffleObjectsFilter implements ObjectFilter {
* database. If unused, no shuffling will be performed. Shuffling takes time
* linearly dependent from the size of the database.
* <p>
- * Key: {@code -dbc.seed}
+ * Key: {@code -shuffle.seed}
* </p>
*/
- public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("dbc.seed", "Seed for randomly shuffling the rows for the database. If the parameter is not set, no shuffling will be performed.");
+ public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("shuffle.seed", "Seed for randomly shuffling the rows for the database. If the parameter is not set, no shuffling will be performed.");
/**
* Seed for randomly shuffling the rows of the database. If null, no shuffling
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java
index 74bbe3ac..5aedc79c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorProjectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorProjectionFilter.java
index 7bbbab2d..06e686c3 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorProjectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorProjectionFilter.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -22,9 +22,9 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import gnu.trove.map.hash.TIntFloatHashMap;
+
import java.util.BitSet;
-import java.util.Collections;
-import java.util.Map;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
@@ -40,7 +40,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
* </p>
*
* @author Arthur Zimek
- *
*/
public class SparseFloatVectorProjectionFilter extends AbstractFeatureSelectionFilter<SparseFloatVector> {
/**
@@ -64,7 +63,7 @@ public class SparseFloatVectorProjectionFilter extends AbstractFeatureSelectionF
@Override
protected SimpleTypeInformation<? super SparseFloatVector> convertedType(SimpleTypeInformation<SparseFloatVector> in) {
- final Map<Integer, Float> emptyMap = Collections.emptyMap();
+ final TIntFloatHashMap emptyMap = new TIntFloatHashMap();
return new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, getDimensionality(), new SparseFloatVector(emptyMap, getDimensionality()));
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorRandomProjectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorRandomProjectionFilter.java
index f8e999b9..fbf26eea 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorRandomProjectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseFloatVectorRandomProjectionFilter.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -22,9 +22,6 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.Collections;
-import java.util.Map;
-
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
@@ -60,8 +57,8 @@ public class SparseFloatVectorRandomProjectionFilter extends AbstractRandomFeatu
@Override
protected SimpleTypeInformation<? super SparseFloatVector> convertedType(SimpleTypeInformation<SparseFloatVector> in) {
- final Map<Integer, Float> emptyMap = Collections.emptyMap();
- return new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, k, new SparseFloatVector(emptyMap, k));
+ initializeRandomAttributes(in);
+ return new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, k, new SparseFloatVector(SparseFloatVector.EMPTYMAP, k));
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java
new file mode 100644
index 00000000..482fc498
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java
@@ -0,0 +1,75 @@
+package de.lmu.ifi.dbs.elki.datasource.filter;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+
+/**
+ * Filter that turns sparse float vectors into a proper vector field, by setting
+ * every vector to the maximum dimensionality observed during preparation.
+ *
+ * @author Erich Schubert
+ */
+public class SparseVectorFieldFilter extends AbstractConversionFilter<SparseFloatVector, SparseFloatVector> {
+ /**
+ * Maximum dimensionality seen so far; -1 until a vector has been processed.
+ */
+ int maxdim = -1;
+
+ /**
+ * Constructor.
+ */
+ public SparseVectorFieldFilter() {
+ super();
+ }
+
+ @Override
+ protected boolean prepareStart(SimpleTypeInformation<SparseFloatVector> in) {
+ return true;
+ }
+
+ @Override
+ protected void prepareProcessInstance(SparseFloatVector obj) {
+ maxdim = Math.max(maxdim, obj.getDimensionality());
+ }
+
+ @Override
+ protected SparseFloatVector filterSingleObject(SparseFloatVector obj) {
+ assert(maxdim > 0);
+ obj.setDimensionality(maxdim);
+ return obj;
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super SparseFloatVector> getInputTypeRestriction() {
+ return TypeUtil.SPARSE_FLOAT_FIELD;
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super SparseFloatVector> convertedType(SimpleTypeInformation<SparseFloatVector> in) {
+ return new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, maxdim, SparseFloatVector.STATIC);
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java
index 25ab7e89..827a5011 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -82,8 +82,8 @@ public class SplitNumberVectorFilter<V extends NumberVector<V, ?>> implements Ob
final VectorFieldTypeInformation<V> vtype = VectorFieldTypeInformation.class.cast(type);
// Get the replacement type informations
- VectorFieldTypeInformation<V> type1 = new VectorFieldTypeInformation<V>(type.getRestrictionClass(), dims.length, dims.length);
- VectorFieldTypeInformation<V> type2 = new VectorFieldTypeInformation<V>(type.getRestrictionClass(), vtype.dimensionality() - dims.length, vtype.dimensionality() - dims.length);
+ VectorFieldTypeInformation<V> type1 = new VectorFieldTypeInformation<V>(type.getRestrictionClass(), type.getSerializer(), dims.length, dims.length);
+ VectorFieldTypeInformation<V> type2 = new VectorFieldTypeInformation<V>(type.getRestrictionClass(), type.getSerializer(), vtype.dimensionality() - dims.length, vtype.dimensionality() - dims.length);
final List<V> col1 = new ArrayList<V>(column.size());
final List<V> col2 = new ArrayList<V>(column.size());
bundle.appendColumn(type1, col1);
@@ -122,8 +122,8 @@ public class SplitNumberVectorFilter<V extends NumberVector<V, ?>> implements Ob
for(int d = 0; d < odims.length; d++) {
part2[d] = obj.doubleValue(odims[d]);
}
- col1.add(obj.newInstance(part1));
- col2.add(obj.newInstance(part2));
+ col1.add(obj.newNumberVector(part1));
+ col2.add(obj.newNumberVector(part2));
}
}
return bundle;
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
new file mode 100644
index 00000000..e40565f9
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
@@ -0,0 +1,43 @@
+package de.lmu.ifi.dbs.elki.datasource.filter;
+
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Object filter that is itself a bundle stream source. Streaming filters are
+ * often more efficient (less memory use) and can be used in more settings.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.uses BundleStreamSource - - «filters»
+ */
+public interface StreamFilter extends ObjectFilter, BundleStreamSource {
+ /**
+ * Connect this filter to the previous stream, which serves as its source.
+ *
+ * @param source Stream source
+ */
+ public void init(BundleStreamSource source);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
index ae75a979..3a629760 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,6 +25,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractConversionFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseErfNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java
index d365a9ae..c0f2a955 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseErfNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
/**
* Attribute-wise Normalization using the error function. This mostly makes
@@ -55,9 +55,9 @@ public class AttributeWiseErfNormalization<O extends NumberVector<O, ?>> extends
protected O filterSingleObject(O obj) {
double[] val = new double[obj.getDimensionality()];
for(int i = 0; i < val.length; i++) {
- val[i] = MathUtil.erf(obj.doubleValue(i + 1));
+ val[i] = NormalDistribution.erf(obj.doubleValue(i + 1));
}
- return obj.newInstance(val);
+ return obj.newNumberVector(val);
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java
index d9a636ec..4cf3c606 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseMinMaxNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,14 +24,13 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
*/
import java.util.ArrayList;
-import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
-import de.lmu.ifi.dbs.elki.utilities.Util;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeSetGlobalConstraint;
@@ -118,10 +117,13 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<V, ?>> exte
@Override
protected V filterSingleObject(V featureVector) {
double[] values = new double[featureVector.getDimensionality()];
+ if(minima.length != featureVector.getDimensionality()) {
+ throw new IllegalArgumentException("FeatureVectors and given Minima/Maxima differ in length.");
+ }
for(int d = 1; d <= featureVector.getDimensionality(); d++) {
values[d - 1] = (featureVector.doubleValue(d) - minima[d - 1]) / factor(d);
}
- return featureVector.newInstance(values);
+ return featureVector.newNumberVector(values);
}
@Override
@@ -131,7 +133,7 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<V, ?>> exte
for(int d = 1; d <= featureVector.getDimensionality(); d++) {
values[d - 1] = (featureVector.doubleValue(d) * (factor(d)) + minima[d - 1]);
}
- return featureVector.newInstance(values);
+ return featureVector.newNumberVector(values);
}
else {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + maxima.length);
@@ -214,13 +216,11 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<V, ?>> exte
super.makeOptions(config);
DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID, true);
if(config.grab(minimaP)) {
- List<Double> min_list = minimaP.getValue();
- minima = Util.unbox(min_list.toArray(new Double[min_list.size()]));
+ minima = ArrayLikeUtil.toPrimitiveDoubleArray(minimaP.getValue());
}
DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID, true);
if(config.grab(maximaP)) {
- List<Double> max_list = maximaP.getValue();
- maxima = Util.unbox(max_list.toArray(new Double[max_list.size()]));
+ maxima = ArrayLikeUtil.toPrimitiveDoubleArray(maximaP.getValue());
}
ArrayList<Parameter<?, ?>> global_1 = new ArrayList<Parameter<?, ?>>();
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java
index 3ae2fdad..52a0499f 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AttributeWiseVarianceNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
*/
import java.util.ArrayList;
-import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
@@ -33,7 +32,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
-import de.lmu.ifi.dbs.elki.utilities.Util;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.WrongParameterValueException;
@@ -145,7 +144,7 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<V, ?>> ex
for(int d = 1; d <= featureVector.getDimensionality(); d++) {
values[d - 1] = normalize(d - 1, featureVector.doubleValue(d));
}
- return featureVector.newInstance(values);
+ return featureVector.newNumberVector(values);
}
@Override
@@ -155,7 +154,7 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<V, ?>> ex
for(int d = 1; d <= featureVector.getDimensionality(); d++) {
values[d - 1] = restore(d - 1, featureVector.doubleValue(d));
}
- return featureVector.newInstance(values);
+ return featureVector.newNumberVector(values);
}
else {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
@@ -163,11 +162,21 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<V, ?>> ex
}
private double normalize(int d, double val) {
- return (val - mean[d]) / stddev[d];
+ if(mean.length == 1) {
+ return (val - mean[0]) / stddev[0];
+ }
+ else {
+ return (val - mean[d]) / stddev[d];
+ }
}
private double restore(int d, double val) {
- return (val * stddev[d]) + mean[d];
+ if(mean.length == 1) {
+ return (val * stddev[0]) + mean[0];
+ }
+ else {
+ return (val * stddev[d]) + mean[d];
+ }
}
@Override
@@ -236,11 +245,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<V, ?>> ex
config.grab(stddevP);
// Note: grab first, then use isDefined, to ensure the stddev is grabbed.
if(meanP.isDefined() && stddevP.isDefined()) {
- List<Double> mean_list = meanP.getValue();
- List<Double> stddev_list = stddevP.getValue();
-
- mean = Util.unbox(mean_list.toArray(new Double[mean_list.size()]));
- stddev = Util.unbox(stddev_list.toArray(new Double[stddev_list.size()]));
+ mean = ArrayLikeUtil.toPrimitiveDoubleArray(meanP.getValue());
+ stddev = ArrayLikeUtil.toPrimitiveDoubleArray(stddevP.getValue());
for(double d : stddev) {
if(d == 0) {
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/InverseDocumentFrequencyNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java
index d8ffd71c..41cce2b9 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/InverseDocumentFrequencyNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,10 +23,12 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import gnu.trove.iterator.TIntDoubleIterator;
+import gnu.trove.map.TIntDoubleMap;
+import gnu.trove.map.hash.TIntDoubleHashMap;
+import gnu.trove.map.hash.TIntFloatHashMap;
+
import java.util.BitSet;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
@@ -44,7 +46,7 @@ public class InverseDocumentFrequencyNormalization extends AbstractNormalization
/**
* The IDF storage
*/
- Map<Integer, Number> idf = new HashMap<Integer, Number>();
+ TIntDoubleMap idf = new TIntDoubleHashMap();
/**
* The number of objects in the dataset
@@ -88,18 +90,19 @@ public class InverseDocumentFrequencyNormalization extends AbstractNormalization
protected void prepareComplete() {
final double dbsize = objcnt;
// Compute IDF values
- for(Entry<Integer, Number> ent : idf.entrySet()) {
+ for(TIntDoubleIterator iter = idf.iterator(); iter.hasNext();) {
+ iter.advance();
// Note: dbsize is a double!
- ent.setValue(Math.log(dbsize / ent.getValue().intValue()));
+ iter.setValue(Math.log(dbsize / iter.value()));
}
}
@Override
protected SparseFloatVector filterSingleObject(SparseFloatVector featureVector) {
BitSet b = featureVector.getNotNullMask();
- Map<Integer, Float> vals = new HashMap<Integer, Float>();
+ TIntFloatHashMap vals = new TIntFloatHashMap();
for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) {
- vals.put(i, (float) (featureVector.doubleValue(i) * idf.get(i).doubleValue()));
+ vals.put(i, (float) (featureVector.doubleValue(i) * idf.get(i)));
}
return new SparseFloatVector(vals, featureVector.getDimensionality());
}
@@ -107,9 +110,9 @@ public class InverseDocumentFrequencyNormalization extends AbstractNormalization
@Override
public SparseFloatVector restore(SparseFloatVector featureVector) {
BitSet b = featureVector.getNotNullMask();
- Map<Integer, Float> vals = new HashMap<Integer, Float>();
+ TIntFloatHashMap vals = new TIntFloatHashMap();
for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) {
- vals.put(i, (float) (featureVector.doubleValue(i) / idf.get(i).doubleValue()));
+ vals.put(i, (float) (featureVector.doubleValue(i) / idf.get(i)));
}
return new SparseFloatVector(vals, featureVector.getDimensionality());
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java
new file mode 100644
index 00000000..6de7eaba
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java
@@ -0,0 +1,115 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Normalizes vectors to norm 1 by scaling with 1 / norm (no check for norm 0).
+ *
+ * @author Heidi Kolb
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class LengthNormalization<V extends NumberVector<V, ?>> extends AbstractNormalization<V> {
+ /**
+ * Norm used to compute the length of each vector
+ */
+ DoubleNorm<? super V> norm;
+
+ /**
+ * Constructor.
+ *
+ * @param norm Norm to use
+ */
+ public LengthNormalization(DoubleNorm<? super V> norm) {
+ super();
+ this.norm = norm;
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ final double d = norm.doubleNorm(featureVector);
+ return featureVector.newNumberVector(featureVector.getColumnVector().timesEquals(1 / d).getArrayRef());
+ }
+
+ @Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+ // TODO: not implemented yet.
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ /**
+ * Option ID for the norm used by the normalization
+ */
+ public static final OptionID NORM_ID = OptionID.getOrCreateOptionID("normalization.norm", "Norm (length function) to use for computing the vector length.");
+
+ /**
+ * Norm to use, set when the norm parameter could be grabbed
+ */
+ DoubleNorm<? super V> norm;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<DoubleNorm<? super V>> normP = new ObjectParameter<DoubleNorm<? super V>>(NORM_ID, DoubleNorm.class, EuclideanDistanceFunction.class);
+ if(config.grab(normP)) {
+ norm = normP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected LengthNormalization<V> makeInstance() {
+ return new LengthNormalization<V>(norm);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NonNumericFeaturesException.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
index 3206518d..9f26482a 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NonNumericFeaturesException.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/Normalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
index 417c4456..96f6bdc1 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/Normalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,6 +24,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
*/
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java
new file mode 100644
index 00000000..be8c1166
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java
@@ -0,0 +1,113 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.IntegerVector;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
+
+/**
+ * Normalize vectors according to their rank in the attributes.
+ *
+ * Note: ranks are multiplied by 2, so that tied records can share an integer rank
+ * (e.g. if the first two records are tied, both get "1", and the next one gets "4").
+ *
+ * @author Erich Schubert
+ */
+public class RankTieNormalization implements ObjectFilter {
+ /**
+ * Constructor.
+ */
+ public RankTieNormalization() {
+ super();
+ }
+
+ @Override
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ final int len = objects.dataLength();
+ MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+
+ for(int r = 0; r < objects.metaLength(); r++) {
+ final SimpleTypeInformation<?> type = objects.meta(r);
+ final List<?> column = objects.getColumn(r);
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ bundle.appendColumn(type, column);
+ continue;
+ }
+ @SuppressWarnings("unchecked")
+ final List<? extends NumberVector<?, ?>> castColumn = (List<? extends NumberVector<?, ?>>) column;
+ // Get the replacement type information
+ final int dim = ((VectorFieldTypeInformation<?>) type).dimensionality();
+ final VectorFieldTypeInformation<IntegerVector> outType = new VectorFieldTypeInformation<IntegerVector>(IntegerVector.class, dim, IntegerVector.STATIC);
+
+ // Output vectors, indexed as posvecs[object][dimension - 1]
+ int[][] posvecs = new int[len][dim];
+ // Sort for each dimension
+ // TODO: an int[] array would be enough, if we could use a comparator...
+ DoubleIntPair[] sorter = new DoubleIntPair[len];
+ for(int i = 0; i < sorter.length; i++) {
+ sorter[i] = new DoubleIntPair(Double.NaN, -1);
+ }
+ for(int d = 1; d <= dim; d++) {
+ // Fill array with (value in dimension d, object index) pairs
+ for(int i = 0; i < sorter.length; i++) {
+ sorter[i].first = castColumn.get(i).doubleValue(d);
+ sorter[i].second = i;
+ }
+ // Sort by the natural order of DoubleIntPair (presumably value first)
+ Arrays.sort(sorter);
+ // Transfer positions to output vectors
+ for(int sta = 0; sta < sorter.length;) {
+ // Find the end (exclusive) of the run of values tied with sorter[sta]
+ int end = sta + 1;
+ while(end < sorter.length && sorter[sta].first == sorter[end].first) {
+ end++;
+ }
+ final int pos = (sta + end - 1); // twice the mean 0-based position of the run
+ for(int i = sta; i < end; i++) {
+ posvecs[sorter[i].second][d - 1] = pos;
+ }
+ sta = end;
+ }
+ }
+
+ // Prepare output data
+ final List<IntegerVector> outColumn = new ArrayList<IntegerVector>(len);
+ for(int i = 0; i < len; i++) {
+ outColumn.add(new IntegerVector(posvecs[i]));
+ }
+ bundle.appendColumn(outType, outColumn);
+ }
+ return bundle;
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/TFIDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
index 65fab4cb..e279c42c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/TFIDFNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,9 +23,9 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import gnu.trove.map.hash.TIntFloatHashMap;
+
import java.util.BitSet;
-import java.util.HashMap;
-import java.util.Map;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
@@ -58,9 +58,9 @@ public class TFIDFNormalization extends InverseDocumentFrequencyNormalization {
if(sum <= 0) {
sum = 1.0;
}
- Map<Integer, Float> vals = new HashMap<Integer, Float>();
+ TIntFloatHashMap vals = new TIntFloatHashMap();
for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) {
- vals.put(i, (float) (featureVector.doubleValue(i) / sum * idf.get(i).doubleValue()));
+ vals.put(i, (float) (featureVector.doubleValue(i) / sum * idf.get(i)));
}
return new SparseFloatVector(vals, featureVector.getDimensionality());
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
new file mode 100644
index 00000000..c0c10a7c
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>Data normalization.</p>
+ */
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2012
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
index 0379c7aa..ca52f814 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team