summary | refs | log | tree | commit | diff
path: root/src/de/lmu/ifi/dbs/elki/datasource
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/AbstractDatabaseConnection.java102
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/ArrayAdapterDatabaseConnection.java19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/BundleDatabaseConnection.java13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java37
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/DBIDRangeDatabaseConnection.java21
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/DatabaseConnection.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/EmptyDatabaseConnection.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/ExternalIDJoinDatabaseConnection.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/FileBasedDatabaseConnection.java47
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/GeneratorXMLDatabaseConnection.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/InputStreamDatabaseConnection.java48
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/LabelJoinDatabaseConnection.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/MultipleObjectsBundleDatabaseConnection.java54
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/PresortedBlindJoinDatabaseConnection.java7
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/RandomDoubleVectorDatabaseConnection.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleMeta.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleReader.java113
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleStreamSource.java29
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleWriter.java74
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/MultipleObjectsBundle.java102
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/ObjectBundle.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/SingleObjectBundle.java34
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/StreamFromBundle.java23
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/bundle/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java23
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java31
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java7
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java14
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java58
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/DropNaNFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java)56
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/NoMissingValuesFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java)9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/ReplaceNaNWithRandomFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java)65
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java219
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java14
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java77
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java326
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java)171
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java)28
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java)81
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java207
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java)58
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java)96
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java)83
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java)19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java97
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java159
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java177
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java)37
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java119
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java3
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ByLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java)26
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/RandomSamplingStreamFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java)9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ShuffleObjectsFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java)23
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/SortByLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java)19
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java33
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/HistogramJitterFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java)13
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java436
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java)9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFromPatternFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java)44
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ExternalIDFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java)10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java124
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SparseVectorFieldFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java)12
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SplitNumberVectorFilter.java (renamed from src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java)17
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java27
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java38
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java88
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java48
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java95
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/CategorialDataAsNumberVectorParser.java28
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/ClusteringVectorParser.java268
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java104
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java99
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java151
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java194
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SimpleTransactionParser.java200
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java143
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java97
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java37
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java10
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java23
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java70
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/Tokenizer.java230
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java6
108 files changed, 4157 insertions, 1843 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/AbstractDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/AbstractDatabaseConnection.java
index 77cdb12c..9da9a550 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/AbstractDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/AbstractDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,6 @@ import java.util.List;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.datasource.filter.StreamFilter;
import de.lmu.ifi.dbs.elki.datasource.parser.Parser;
@@ -73,83 +72,50 @@ public abstract class AbstractDatabaseConnection implements DatabaseConnection {
* @param bundle the objects to process
* @return processed objects
*/
- protected MultipleObjectsBundle invokeFilters(MultipleObjectsBundle bundle) {
- BundleStreamSource prevs = null;
- MultipleObjectsBundle prevb = bundle;
- if(filters != null) {
- for(ObjectFilter filter : filters) {
- if(filter instanceof StreamFilter) {
- StreamFilter sfilter = (StreamFilter) filter;
- if(prevs != null) {
- sfilter.init(prevs);
- }
- else {
- sfilter.init(new StreamFromBundle(prevb));
- }
- prevs = sfilter;
- prevb = null;
- }
- else {
- if(prevs != null) {
- prevb = filter.filter(MultipleObjectsBundle.fromStream(prevs));
- prevs = null;
- }
- else {
- prevb = filter.filter(prevb);
- prevs = null;
- }
- }
- }
- }
- if(prevb != null) {
- return prevb;
+ protected MultipleObjectsBundle invokeBundleFilters(MultipleObjectsBundle bundle) {
+ if(filters == null) {
+ return bundle;
}
- else {
- return MultipleObjectsBundle.fromStream(prevs);
+ // We dynamically switch between streaming and bundle operations.
+ BundleStreamSource stream = null;
+ for(ObjectFilter filter : filters) {
+ if(filter instanceof StreamFilter) {
+ StreamFilter sfilter = (StreamFilter) filter;
+ stream = sfilter.init((stream != null) ? stream : bundle.asStream());
+ bundle = null; // No longer a bundle
+ }
+ else {
+ bundle = filter.filter((bundle != null) ? bundle : stream.asMultipleObjectsBundle());
+ stream = null; // No longer a stream
+ }
}
+ return (bundle != null) ? bundle : stream.asMultipleObjectsBundle();
}
/**
* Transforms the specified list of objects and their labels into a list of
* objects and their associations.
*
- * @param bundle the objects to process
+ * @param stream the objects to process
* @return processed objects
*/
- protected BundleStreamSource invokeFilters(BundleStreamSource bundle) {
- BundleStreamSource prevs = bundle;
- MultipleObjectsBundle prevb = null;
- if(filters != null) {
- for(ObjectFilter filter : filters) {
- if(filter instanceof StreamFilter) {
- StreamFilter sfilter = (StreamFilter) filter;
- if(prevs != null) {
- sfilter.init(prevs);
- }
- else {
- sfilter.init(new StreamFromBundle(prevb));
- }
- prevs = sfilter;
- prevb = null;
- }
- else {
- if(prevs != null) {
- prevb = filter.filter(MultipleObjectsBundle.fromStream(prevs));
- prevs = null;
- }
- else {
- prevb = filter.filter(prevb);
- prevs = null;
- }
- }
- }
- }
- if(prevs != null) {
- return prevs;
+ protected BundleStreamSource invokeStreamFilters(BundleStreamSource stream) {
+ if(filters == null) {
+ return stream;
}
- else {
- return new StreamFromBundle(prevb);
+ // We dynamically switch between streaming and bundle operations.
+ MultipleObjectsBundle bundle = null;
+ for(ObjectFilter filter : filters) {
+ if(filter instanceof StreamFilter) {
+ stream = ((StreamFilter) filter).init((stream != null) ? stream : bundle.asStream());
+ bundle = null;
+ }
+ else {
+ bundle = filter.filter((bundle != null) ? bundle : stream.asMultipleObjectsBundle());
+ stream = null;
+ }
}
+ return (stream != null) ? stream : bundle.asStream();
}
/**
@@ -187,7 +153,7 @@ public abstract class AbstractDatabaseConnection implements DatabaseConnection {
* Filters
*/
protected List<ObjectFilter> filters;
-
+
/**
* Parser to use
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/ArrayAdapterDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/ArrayAdapterDatabaseConnection.java
index 1779f851..244033c2 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/ArrayAdapterDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/ArrayAdapterDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,8 +31,7 @@ import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDFactory;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -97,11 +96,7 @@ public class ArrayAdapterDatabaseConnection implements DatabaseConnection {
public MultipleObjectsBundle loadData() {
MultipleObjectsBundle b = new MultipleObjectsBundle();
if(startid != null) {
- List<DBID> ids = new ArrayList<>(data.length);
- for(int i = 0; i < data.length; i++) {
- ids.add(DBIDUtil.importInteger(startid.intValue() + i));
- }
- b.appendColumn(TypeUtil.DBID, Arrays.asList(labels));
+ b.setDBIDs(DBIDFactory.FACTORY.generateStaticDBIDRange(startid, data.length));
}
int mind = Integer.MAX_VALUE;
@@ -112,13 +107,7 @@ public class ArrayAdapterDatabaseConnection implements DatabaseConnection {
maxd = Math.max(maxd, data[i].length);
vecs.add(new DoubleVector(data[i]));
}
- SimpleTypeInformation<DoubleVector> type;
- if(mind == maxd) {
- type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, mind);
- }
- else {
- type = new SimpleTypeInformation<>(DoubleVector.class);
- }
+ SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, mind, maxd, DoubleVector.FACTORY.getDefaultSerializer());
b.appendColumn(type, vecs);
if(labels != null) {
if(labels.length != data.length) {
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/BundleDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/BundleDatabaseConnection.java
index c8821bc9..7407bf55 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/BundleDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/BundleDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,7 +29,6 @@ import java.nio.channels.FileChannel;
import java.util.List;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleReader;
-import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -47,7 +46,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileParameter;
*
* @author Erich Schubert
*
- * @apiviz.composedOf BundleStreamSource
+ * @apiviz.composedOf BundleStreamSource
*/
public class BundleDatabaseConnection extends AbstractDatabaseConnection {
/**
@@ -76,12 +75,12 @@ public class BundleDatabaseConnection extends AbstractDatabaseConnection {
try {
FileInputStream fis = new FileInputStream(infile);
FileChannel channel = fis.getChannel();
- BundleStreamSource src = invokeFilters(new BundleReader(channel));
- MultipleObjectsBundle bundle = MultipleObjectsBundle.fromStream(src);
+ MultipleObjectsBundle bundle = invokeStreamFilters(new BundleReader(channel)).asMultipleObjectsBundle();
channel.close();
fis.close();
return bundle;
- } catch (IOException e) {
+ }
+ catch(IOException e) {
throw new AbortException("IO error loading bundle", e);
}
}
@@ -114,7 +113,7 @@ public class BundleDatabaseConnection extends AbstractDatabaseConnection {
super.makeOptions(config);
configFilters(config);
FileParameter infileP = new FileParameter(BUNDLE_ID, FileParameter.FileType.INPUT_FILE);
- if (config.grab(infileP)) {
+ if(config.grab(infileP)) {
infile = infileP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java
index 22aadc08..110e131a 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -36,7 +36,6 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource.Event;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser;
import de.lmu.ifi.dbs.elki.datasource.parser.Parser;
@@ -86,35 +85,37 @@ public class ConcatenateFilesDatabaseConnection extends AbstractDatabaseConnecti
public MultipleObjectsBundle loadData() {
MultipleObjectsBundle objects = new MultipleObjectsBundle();
objects.appendColumn(TypeUtil.STRING, new ArrayList<>());
- for (File file : files) {
+ for(File file : files) {
String filestr = file.getPath();
try {
InputStream inputStream = new BufferedInputStream(new FileInputStream(file));
inputStream = FileUtil.tryGzipInput(inputStream);
final BundleStreamSource source;
- if (parser instanceof StreamingParser) {
+ if(parser instanceof StreamingParser) {
final StreamingParser streamParser = (StreamingParser) parser;
streamParser.initStream(inputStream);
source = streamParser;
- } else {
+ }
+ else {
MultipleObjectsBundle parsingResult = parser.parse(inputStream);
// normalize objects and transform labels
- source = new StreamFromBundle(parsingResult);
+ source = parsingResult.asStream();
}
BundleMeta meta = null; // NullPointerException on invalid streams
- loop: for (Event e = source.nextEvent();; e = source.nextEvent()) {
- switch(e) {
+ loop: for(Event e = source.nextEvent();; e = source.nextEvent()) {
+ switch(e){
case END_OF_STREAM:
break loop;
case META_CHANGED:
meta = source.getMeta();
- for (int i = 0; i < meta.size(); i++) {
- if (i + 1 >= objects.metaLength()) {
+ for(int i = 0; i < meta.size(); i++) {
+ if(i + 1 >= objects.metaLength()) {
objects.appendColumn(meta.get(i), new ArrayList<>());
- } else {
+ }
+ else {
// Ensure compatibility:
- if (!objects.meta(i + 1).isAssignableFromType(meta.get(i))) {
+ if(!objects.meta(i + 1).isAssignableFromType(meta.get(i))) {
throw new AbortException("Incompatible files loaded. Cannot concatenate with unaligned columns, please preprocess manually.");
}
}
@@ -123,22 +124,24 @@ public class ConcatenateFilesDatabaseConnection extends AbstractDatabaseConnecti
case NEXT_OBJECT:
Object[] o = new Object[objects.metaLength()];
o[0] = filestr;
- for (int i = 0; i < meta.size(); i++) {
+ for(int i = 0; i < meta.size(); i++) {
o[i + 1] = source.data(i);
}
objects.appendSimple(o);
break; // switch
}
}
- } catch (IOException e) {
+ }
+ catch(IOException e) {
throw new AbortException("Loading file " + filestr + " failed: " + e.toString(), e);
}
}
+ parser.cleanup();
// Invoke filters
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debugFine("Invoking filters.");
}
- return invokeFilters(objects);
+ return invokeBundleFilters(objects);
}
@Override
@@ -163,7 +166,7 @@ public class ConcatenateFilesDatabaseConnection extends AbstractDatabaseConnecti
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
FileListParameter filesP = new FileListParameter(FileBasedDatabaseConnection.Parameterizer.INPUT_ID, FilesType.INPUT_FILES);
- if (config.grab(filesP)) {
+ if(config.grab(filesP)) {
files = filesP.getValue();
}
configFilters(config);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/DBIDRangeDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/DBIDRangeDatabaseConnection.java
index 2e7f59e7..2883a008 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/DBIDRangeDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/DBIDRangeDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,12 +23,7 @@ package de.lmu.ifi.dbs.elki.datasource;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.ArrayList;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDFactory;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -70,11 +65,7 @@ public class DBIDRangeDatabaseConnection implements DatabaseConnection {
@Override
public MultipleObjectsBundle loadData() {
MultipleObjectsBundle b = new MultipleObjectsBundle();
- List<DBID> ids = new ArrayList<>(count);
- for(int i = 0; i < count; i++) {
- ids.add(DBIDUtil.importInteger(start + i));
- }
- b.appendColumn(TypeUtil.DBID, ids);
+ b.setDBIDs(DBIDFactory.FACTORY.generateStaticDBIDRange(start, count));
return b;
}
@@ -109,13 +100,13 @@ public class DBIDRangeDatabaseConnection implements DatabaseConnection {
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter startp = new IntParameter(START_ID, Integer.valueOf(0));
+ IntParameter startp = new IntParameter(START_ID, 0);
if(config.grab(startp)) {
- start = startp.getValue().intValue();
+ start = startp.intValue();
}
IntParameter countp = new IntParameter(COUNT_ID);
if(config.grab(countp)) {
- count = countp.getValue().intValue();
+ count = countp.intValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/DatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/DatabaseConnection.java
index 6fe3ae39..ac6718c7 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/DatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/DatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.datasource;
*/
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
/**
* DatabaseConnection is used to load data into a database.
@@ -39,7 +38,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
* @apiviz.landmark
* @apiviz.has MultipleObjectsBundle
*/
-public interface DatabaseConnection extends Parameterizable {
+public interface DatabaseConnection {
/**
* Returns the initial data for a database.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/EmptyDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/EmptyDatabaseConnection.java
index 70b1da50..1c6e7566 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/EmptyDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/EmptyDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/ExternalIDJoinDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/ExternalIDJoinDatabaseConnection.java
index 38aefaca..509a6c28 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/ExternalIDJoinDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/ExternalIDJoinDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -37,7 +37,6 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParameter;
@@ -48,7 +47,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParamet
*
* @apiviz.uses ExternalID
*/
-public class ExternalIDJoinDatabaseConnection extends AbstractDatabaseConnection implements Parameterizable {
+public class ExternalIDJoinDatabaseConnection extends AbstractDatabaseConnection {
/**
* Logger
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/FileBasedDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/FileBasedDatabaseConnection.java
index f2ced700..300b34d2 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/FileBasedDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/FileBasedDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -39,7 +39,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileParameter;
/**
- * Provides a file based database connection based on the parser to be set.
+ * File based database connection based on the parser to be set.
*
* @author Arthur Zimek
*
@@ -51,7 +51,41 @@ public class FileBasedDatabaseConnection extends InputStreamDatabaseConnection {
*
* @param filters Filters, can be null
* @param parser the parser to provide a database
- * @param in the input stream to parse from.
+ * @param infile File to load the data from
+ */
+ public FileBasedDatabaseConnection(List<ObjectFilter> filters, Parser parser, File infile) {
+ super(filters, parser);
+ try {
+ this.in = new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(infile)));
+ }
+ catch(IOException e) {
+ throw new AbortException("Could not load input file: " + infile, e);
+ }
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param filters Filters, can be null
+ * @param parser the parser to provide a database
+ * @param infile File to load the data from
+ */
+ public FileBasedDatabaseConnection(List<ObjectFilter> filters, Parser parser, String infile) {
+ super(filters, parser);
+ try {
+ this.in = new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(infile)));
+ }
+ catch(IOException e) {
+ throw new AbortException("Could not load input file: " + infile, e);
+ }
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param filters Filters, can be null
+ * @param parser the parser to provide a database
+ * @param in Input stream
*/
public FileBasedDatabaseConnection(List<ObjectFilter> filters, Parser parser, InputStream in) {
super(filters, parser);
@@ -91,12 +125,7 @@ public class FileBasedDatabaseConnection extends InputStreamDatabaseConnection {
@Override
protected FileBasedDatabaseConnection makeInstance() {
- try {
- return new FileBasedDatabaseConnection(filters, parser, new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(infile))));
- }
- catch(IOException e) {
- throw new AbortException("Input file could not be opened.", e);
- }
+ return new FileBasedDatabaseConnection(filters, parser, infile);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/GeneratorXMLDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/GeneratorXMLDatabaseConnection.java
index 57bfd6bc..2c3e2019 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/GeneratorXMLDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/GeneratorXMLDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/InputStreamDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/InputStreamDatabaseConnection.java
index 0e99b421..21439807 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/InputStreamDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/InputStreamDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -38,8 +38,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
/**
- * Provides a database connection expecting input from an input stream such as
- * stdin.
+ * Database connection expecting input from an input stream such as stdin.
*
* @author Arthur Zimek
*
@@ -80,48 +79,39 @@ public class InputStreamDatabaseConnection extends AbstractDatabaseConnection {
if(LOG.isDebugging()) {
LOG.debugFine("Invoking parsers.");
}
+ // Streaming parsers may yield to stream filters immediately.
if(parser instanceof StreamingParser) {
- final StreamingParser streamParser = (StreamingParser)parser;
+ final StreamingParser streamParser = (StreamingParser) parser;
streamParser.initStream(in);
-
// normalize objects and transform labels
if(LOG.isDebugging()) {
- LOG.debugFine("Invoking filters.");
- }
- Duration duration = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".load") : null;
- if (duration != null) {
- duration.begin();
+ LOG.debugFine("Parsing as stream.");
}
- MultipleObjectsBundle objects = MultipleObjectsBundle.fromStream(invokeFilters(streamParser));
- if (duration != null) {
- duration.end();
- LOG.statistics(duration);
+ Duration duration = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".load").begin() : null;
+ MultipleObjectsBundle objects = invokeStreamFilters(streamParser).asMultipleObjectsBundle();
+ parser.cleanup();
+ if(duration != null) {
+ LOG.statistics(duration.end());
}
return objects;
}
else {
- Duration duration = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".parse") : null;
- if (duration != null) {
- duration.begin();
- }
+ // For non-streaming parsers, we first parse, then filter
+ Duration duration = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".parse").begin() : null;
MultipleObjectsBundle parsingResult = parser.parse(in);
- if (duration != null) {
- duration.end();
- LOG.statistics(duration);
+ parser.cleanup();
+ if(duration != null) {
+ LOG.statistics(duration.end());
}
// normalize objects and transform labels
if(LOG.isDebugging()) {
LOG.debugFine("Invoking filters.");
}
- Duration fduration = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".filter") : null;
- if (fduration != null) {
- fduration.begin();
- }
- MultipleObjectsBundle objects = invokeFilters(parsingResult);
- if (fduration != null) {
- fduration.end();
- LOG.statistics(fduration);
+ Duration fduration = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".filter").begin() : null;
+ MultipleObjectsBundle objects = invokeBundleFilters(parsingResult);
+ if(fduration != null) {
+ LOG.statistics(fduration.end());
}
return objects;
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/LabelJoinDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/LabelJoinDatabaseConnection.java
index fc28b989..de581230 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/LabelJoinDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/LabelJoinDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -37,7 +37,6 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParameter;
@@ -48,7 +47,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParamet
*
* @apiviz.uses LabelList
*/
-public class LabelJoinDatabaseConnection extends AbstractDatabaseConnection implements Parameterizable {
+public class LabelJoinDatabaseConnection extends AbstractDatabaseConnection {
/**
* Logger
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/MultipleObjectsBundleDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/MultipleObjectsBundleDatabaseConnection.java
new file mode 100644
index 00000000..202d1871
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/MultipleObjectsBundleDatabaseConnection.java
@@ -0,0 +1,54 @@
+package de.lmu.ifi.dbs.elki.datasource;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+
+/**
+ * Data source to feed a precomputed {@link MultipleObjectsBundle} into a
+ * database.
+ *
+ * @author Erich Schubert
+ */
+public class MultipleObjectsBundleDatabaseConnection implements DatabaseConnection {
+ /**
+ * Bundle.
+ */
+ MultipleObjectsBundle bundle;
+
+ /**
+ * Constructor.
+ *
+ * @param bundle Existing bundle.
+ */
+ public MultipleObjectsBundleDatabaseConnection(MultipleObjectsBundle bundle) {
+ super();
+ this.bundle = bundle;
+ }
+
+ @Override
+ public MultipleObjectsBundle loadData() {
+ return bundle;
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/PresortedBlindJoinDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/PresortedBlindJoinDatabaseConnection.java
index 4f52f5f2..b51a23df 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/PresortedBlindJoinDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/PresortedBlindJoinDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -32,7 +32,6 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParameter;
@@ -43,7 +42,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParamet
* @author Erich Schubert
*/
@Description("Blindly joins multiple data sources, assuming they are ordered the same way.")
-public class PresortedBlindJoinDatabaseConnection extends AbstractDatabaseConnection implements Parameterizable {
+public class PresortedBlindJoinDatabaseConnection extends AbstractDatabaseConnection {
/**
* Logger
*/
@@ -84,7 +83,7 @@ public class PresortedBlindJoinDatabaseConnection extends AbstractDatabaseConnec
}
}
- return invokeFilters(first);
+ return invokeBundleFilters(first);
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/RandomDoubleVectorDatabaseConnection.java b/src/de/lmu/ifi/dbs/elki/datasource/RandomDoubleVectorDatabaseConnection.java
index 2f3a943b..29d7f2d7 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/RandomDoubleVectorDatabaseConnection.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/RandomDoubleVectorDatabaseConnection.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,7 +33,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -154,11 +154,11 @@ public class RandomDoubleVectorDatabaseConnection extends AbstractDatabaseConnec
configFilters(config);
IntParameter dimParam = new IntParameter(DIM_ID);
if(config.grab(dimParam)) {
- dim = dimParam.getValue().intValue();
+ dim = dimParam.intValue();
}
IntParameter sizeParam = new IntParameter(SIZE_ID);
if(config.grab(sizeParam)) {
- size = sizeParam.getValue().intValue();
+ size = sizeParam.intValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
if(config.grab(rndP)) {
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleMeta.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleMeta.java
index 442c00e8..7b148f3c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleMeta.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleMeta.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleReader.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleReader.java
index 4f57cac8..da72ac1b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleReader.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleReader.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,13 +26,15 @@ import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
-import java.util.ArrayList;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformationSerializer;
-import de.lmu.ifi.dbs.elki.persistent.ByteArrayUtil;
-import de.lmu.ifi.dbs.elki.persistent.ByteBufferSerializer;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil;
+import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer;
/**
* Read an ELKI bundle file into a data stream.
@@ -41,7 +43,8 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
*
* @author Erich Schubert
*
- * @apiviz.uses FileChannel
+ * @apiviz.uses MappedByteBuffer - - «reads»
+ * @apiviz.uses FileChannel - - «reads»
*/
public class BundleReader implements BundleStreamSource {
/**
@@ -52,7 +55,7 @@ public class BundleReader implements BundleStreamSource {
/**
* The stream buffer.
*/
- MappedByteBuffer buffer;
+ MappedByteBuffer buffer = null;
/**
* Bundle metadata.
@@ -62,17 +65,33 @@ public class BundleReader implements BundleStreamSource {
/**
* Input channel.
*/
- FileChannel input;
+ FileChannel input = null;
/**
* Serializers to use.
*/
- ArrayList<ByteBufferSerializer<?>> sers;
+ ByteBufferSerializer<?>[] sers;
/**
* Current object.
*/
- ArrayList<Object> data;
+ Object[] data;
+
+ /**
+ * Whether or not we have DBIDs.
+ */
+ boolean hasids = false;
+
+ /**
+ * Constructor.
+ *
+ * @param buffer Input buffer
+ */
+ public BundleReader(MappedByteBuffer buffer) {
+ super();
+ this.buffer = buffer;
+ this.input = null;
+ }
/**
* Constructor.
@@ -86,7 +105,7 @@ public class BundleReader implements BundleStreamSource {
@Override
public BundleMeta getMeta() {
- if (meta == null) {
+ if(meta == null) {
openBuffer();
readMeta();
}
@@ -97,10 +116,13 @@ public class BundleReader implements BundleStreamSource {
* Map the input file.
*/
void openBuffer() {
- try {
- buffer = input.map(MapMode.READ_ONLY, 0, input.size());
- } catch (IOException e) {
- throw new AbortException("Cannot map input bundle.", e);
+ if(buffer == null) {
+ try {
+ buffer = input.map(MapMode.READ_ONLY, 0, input.size());
+ }
+ catch(IOException e) {
+ throw new AbortException("Cannot map input bundle.", e);
+ }
}
}
@@ -109,23 +131,30 @@ public class BundleReader implements BundleStreamSource {
*/
void readMeta() {
final int check = buffer.getInt();
- if (check != MAGIC) {
+ if(check != MAGIC) {
throw new AbortException("File does not start with expected magic.");
}
final int nummeta = buffer.getInt();
- assert (nummeta > 0);
+ assert (nummeta > 0) : "Empty bundle?";
meta = new BundleMeta(nummeta);
- sers = new ArrayList<>(nummeta);
- data = new ArrayList<>(nummeta);
- for (int i = 0; i < nummeta; i++) {
+ sers = new ByteBufferSerializer<?>[nummeta];
+ data = new Object[nummeta];
+ for(int i = 0; i < nummeta; i++) {
try {
@SuppressWarnings("unchecked")
SimpleTypeInformation<? extends Object> type = (SimpleTypeInformation<? extends Object>) TypeInformationSerializer.STATIC.fromByteBuffer(buffer);
- meta.add(type);
- sers.add(type.getSerializer());
- } catch (UnsupportedOperationException e) {
- throw new AbortException("Deserialization failed: "+e.getMessage(), e);
- } catch (IOException e) {
+ sers[i] = type.getSerializer();
+ if(i == 0 && TypeUtil.DBID.isAssignableFromType(type)) {
+ hasids = true;
+ }
+ else {
+ meta.add(type);
+ }
+ }
+ catch(UnsupportedOperationException e) {
+ throw new AbortException("Deserialization failed: " + e.getMessage(), e);
+ }
+ catch(IOException e) {
throw new AbortException("IO error", e);
}
}
@@ -135,13 +164,14 @@ public class BundleReader implements BundleStreamSource {
* Read an object.
*/
void readObject() {
- data.clear();
- for (ByteBufferSerializer<?> ser : sers) {
+ for(int i = 0; i < sers.length; ++i) {
try {
- data.add(ser.fromByteBuffer(buffer));
- } catch (UnsupportedOperationException e) {
+ data[i] = sers[i].fromByteBuffer(buffer);
+ }
+ catch(UnsupportedOperationException e) {
throw new AbortException("Deserialization failed.", e);
- } catch (IOException e) {
+ }
+ catch(IOException e) {
throw new AbortException("IO error", e);
}
}
@@ -150,10 +180,10 @@ public class BundleReader implements BundleStreamSource {
@Override
public Event nextEvent() {
// Send initial meta
- if (meta == null) {
+ if(meta == null) {
return Event.META_CHANGED;
}
- if (buffer.remaining() == 0) {
+ if(buffer.remaining() == 0) {
ByteArrayUtil.unmapByteBuffer(buffer);
return Event.END_OF_STREAM;
}
@@ -163,6 +193,25 @@ public class BundleReader implements BundleStreamSource {
@Override
public Object data(int rnum) {
- return data.get(rnum);
+ return data[!hasids ? rnum : (rnum + 1)];
+ }
+
+ @Override
+ public boolean hasDBIDs() {
+ return hasids;
+ }
+
+ @Override
+ public boolean assignDBID(DBIDVar var) {
+ if(!hasids) {
+ return false;
+ }
+ var.set((DBID) data[0]);
+ return true;
+ }
+
+ @Override
+ public MultipleObjectsBundle asMultipleObjectsBundle() {
+ return MultipleObjectsBundle.fromStream(this);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleStreamSource.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleStreamSource.java
index cbac6134..bffbd6a0 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleStreamSource.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleStreamSource.java
@@ -1,10 +1,12 @@
package de.lmu.ifi.dbs.elki.datasource.bundle;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,14 +31,13 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
* @author Erich Schubert
*
* @apiviz.composedOf BundleMeta
+ * @apiviz.has BundleStreamSource.Event
*/
public interface BundleStreamSource {
/**
* Events
*
* @author Erich Schubert
- *
- * @apiviz.exclude
*/
public static enum Event {
// Metadata has changed
@@ -63,9 +64,31 @@ public interface BundleStreamSource {
public Object data(int rnum);
/**
+ * Indicate whether the stream contains DBIDs.
+ *
+ * @return {@code true} if the stream contains DBIDs.
+ */
+ public boolean hasDBIDs();
+
+ /**
+ * Assign the current object ID to a {@link DBIDVar}.
+ *
+ * @param var Variable to assign the object id to
+ * @return {@code false} when no object id is available
+ */
+ public boolean assignDBID(DBIDVar var);
+
+ /**
* Get the next event
*
* @return Event type
*/
public Event nextEvent();
+
+ /**
+ * Return (or collect) the stream as bundle.
+ *
+ * @return Bundle
+ */
+ public MultipleObjectsBundle asMultipleObjectsBundle();
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleWriter.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleWriter.java
index 4b7c4a3d..dbad6794 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleWriter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/BundleWriter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,9 +29,13 @@ import java.nio.channels.WritableByteChannel;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformationSerializer;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.persistent.ByteBufferSerializer;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer;
/**
* Write an object bundle stream to a file channel.
@@ -40,8 +44,8 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
*
* @author Erich Schubert
*
- * @apiviz.uses BundleStreamSource
- * @apiviz.uses WritableByteChannel
+ * @apiviz.uses BundleStreamSource - - «reads»
+ * @apiviz.uses WritableByteChannel - - «writes»
*/
public class BundleWriter {
/**
@@ -69,22 +73,36 @@ public class BundleWriter {
public void writeBundleStream(BundleStreamSource source, WritableByteChannel output) throws IOException {
ByteBuffer buffer = ByteBuffer.allocateDirect(INITIAL_BUFFER);
- ByteBufferSerializer<Object>[] serializers = null;
- loop: while (true) {
+ DBIDVar var = DBIDUtil.newVar();
+ ByteBufferSerializer<?>[] serializers = null;
+ loop: while(true) {
BundleStreamSource.Event ev = source.nextEvent();
- switch(ev) {
+ switch(ev){
case NEXT_OBJECT:
- if (serializers == null) {
+ if(serializers == null) {
serializers = writeHeader(source, buffer, output);
}
- for (int i = 0; i < serializers.length; i++) {
- int size = serializers[i].getByteSize(source.data(i));
+ if(serializers[0] != null) {
+ if(!source.assignDBID(var)) {
+ throw new AbortException("An object did not have an DBID assigned.");
+ }
+ DBID id = DBIDUtil.deref(var);
+ @SuppressWarnings("unchecked")
+ ByteBufferSerializer<DBID> ser = (ByteBufferSerializer<DBID>) serializers[0];
+ int size = ser.getByteSize(id);
buffer = ensureBuffer(size, buffer, output);
- serializers[i].toByteBuffer(buffer, source.data(i));
+ ser.toByteBuffer(buffer, id);
+ }
+ for(int i = 1, j = 0; i < serializers.length; ++i, ++j) {
+ @SuppressWarnings("unchecked")
+ ByteBufferSerializer<Object> ser = (ByteBufferSerializer<Object>) serializers[i];
+ int size = ser.getByteSize(source.data(j));
+ buffer = ensureBuffer(size, buffer, output);
+ ser.toByteBuffer(buffer, source.data(j));
}
break; // switch
case META_CHANGED:
- if (serializers != null) {
+ if(serializers != null) {
throw new AbortException("Meta changes are not supported, once the block header has been written.");
}
break; // switch
@@ -95,7 +113,7 @@ public class BundleWriter {
break; // switch
}
}
- if (buffer.position() > 0) {
+ if(buffer.position() > 0) {
flushBuffer(buffer, output);
}
}
@@ -124,11 +142,11 @@ public class BundleWriter {
* @throws IOException on IO errors
*/
private ByteBuffer ensureBuffer(int size, ByteBuffer buffer, WritableByteChannel output) throws IOException {
- if (buffer.remaining() >= size) {
+ if(buffer.remaining() >= size) {
return buffer;
}
flushBuffer(buffer, output);
- if (buffer.remaining() >= size) {
+ if(buffer.remaining() >= size) {
return buffer;
}
// Aggressively grow the buffer
@@ -144,25 +162,33 @@ public class BundleWriter {
* @return Array of serializers
* @throws IOException on IO errors
*/
- @SuppressWarnings("unchecked")
- private ByteBufferSerializer<Object>[] writeHeader(BundleStreamSource source, ByteBuffer buffer, WritableByteChannel output) throws IOException {
+ private ByteBufferSerializer<?>[] writeHeader(BundleStreamSource source, ByteBuffer buffer, WritableByteChannel output) throws IOException {
final BundleMeta meta = source.getMeta();
final int nummeta = meta.size();
@SuppressWarnings("rawtypes")
- final ByteBufferSerializer[] serializers = new ByteBufferSerializer[nummeta];
+ final ByteBufferSerializer[] serializers = new ByteBufferSerializer[1 + nummeta];
// Write our magic ID first.
assert (buffer.position() == 0) : "Buffer is supposed to be at 0.";
buffer.putInt(MAGIC);
- // Write the number of metas next
- buffer.putInt(nummeta);
- for (int i = 0; i < nummeta; i++) {
+ // Write the number of metas next.
+ // For compatibility with earlier versions, treat DBIDs as an extra type
+ if(source.hasDBIDs()) {
+ buffer.putInt(1 + nummeta);
+ ByteBufferSerializer<?> ser = TypeUtil.DBID.getSerializer();
+ TypeInformationSerializer.STATIC.toByteBuffer(buffer, TypeUtil.DBID);
+ serializers[0] = ser;
+ }
+ else {
+ buffer.putInt(nummeta);
+ }
+ for(int i = 0; i < nummeta; i++) {
SimpleTypeInformation<?> type = meta.get(i);
- ByteBufferSerializer<Object> ser = (ByteBufferSerializer<Object>) type.getSerializer();
- if (ser == null) {
+ ByteBufferSerializer<?> ser = type.getSerializer();
+ if(ser == null) {
throw new AbortException("Cannot serialize - no serializer found for type: " + type.toString());
}
TypeInformationSerializer.STATIC.toByteBuffer(buffer, type);
- serializers[i] = ser;
+ serializers[i + 1] = ser;
}
return serializers;
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/MultipleObjectsBundle.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/MultipleObjectsBundle.java
index 37b517dd..4979360c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/MultipleObjectsBundle.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/MultipleObjectsBundle.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,6 +27,11 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -41,6 +46,11 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
*/
public class MultipleObjectsBundle implements ObjectBundle {
/**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(MultipleObjectsBundle.class);
+
+ /**
* Storing the meta data.
*/
private BundleMeta meta;
@@ -51,38 +61,16 @@ public class MultipleObjectsBundle implements ObjectBundle {
private List<List<?>> columns;
/**
- * Constructor.
+ * DBIDs for these objects; may be {@code null} if none were assigned.
*/
- public MultipleObjectsBundle() {
- this.meta = new BundleMeta();
- this.columns = new ArrayList<>();
- }
+ private ArrayDBIDs ids;
/**
* Constructor.
- *
- * @param meta Meta data contained.
- * @param columns Content in columns
*/
- @Deprecated
- public MultipleObjectsBundle(BundleMeta meta, List<List<?>> columns) {
- super();
- this.meta = meta;
- this.columns = columns;
- if(this.columns.size() != this.meta.size()) {
- throw new AbortException("Meta size and columns do not agree!");
- }
- int len = -1;
- for(List<?> col : columns) {
- if(len < 0) {
- len = col.size();
- }
- else {
- if(col.size() != len) {
- throw new AbortException("Column lengths do not agree.");
- }
- }
- }
+ public MultipleObjectsBundle() {
+ this.meta = new BundleMeta();
+ this.columns = new ArrayList<>();
}
@Override
@@ -106,8 +94,18 @@ public class MultipleObjectsBundle implements ObjectBundle {
}
@Override
+ public boolean assignDBID(int onum, DBIDVar var) {
+ if(ids == null) {
+ var.unset();
+ return false;
+ }
+ ids.assignVar(onum, var);
+ return true;
+ }
+
+ @Override
public int dataLength() {
- return (columns.size() == 0) ? 0 : columns.get(0).size();
+ return (ids != null) ? ids.size() : (columns.size() == 0) ? 0 : columns.get(0).size();
}
/**
@@ -141,6 +139,24 @@ public class MultipleObjectsBundle implements ObjectBundle {
}
/**
+ * Set the DBIDs for this bundle.
+ *
+ * @param ids DBIDs
+ */
+ public void setDBIDs(ArrayDBIDs ids) {
+ this.ids = ids;
+ }
+
+ /**
+ * Get the DBIDs, may be {@code null}.
+ *
+ * @return DBIDs
+ */
+ public ArrayDBIDs getDBIDs() {
+ return ids;
+ }
+
+ /**
* Get the raw objects columns. Use with caution!
*
* @param i column number
@@ -202,6 +218,15 @@ public class MultipleObjectsBundle implements ObjectBundle {
}
/**
+ * Process this bundle as stream.
+ *
+ * @return Stream
+ */
+ public BundleStreamSource asStream() {
+ return new StreamFromBundle(this);
+ }
+
+ /**
* Convert an object stream to a bundle
*
* @param source Object stream
@@ -210,6 +235,9 @@ public class MultipleObjectsBundle implements ObjectBundle {
public static MultipleObjectsBundle fromStream(BundleStreamSource source) {
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
boolean stop = false;
+ DBIDVar var = null;
+ ArrayModifiableDBIDs ids = null;
+ int size = 0;
while(!stop) {
BundleStreamSource.Event ev = source.nextEvent();
switch(ev){
@@ -227,19 +255,35 @@ public class MultipleObjectsBundle implements ObjectBundle {
List<Object> data = new ArrayList<>(bundle.dataLength() + 1);
bundle.appendColumn(smeta.get(i), data);
}
+ if(var == null && source.hasDBIDs()) {
+ var = DBIDUtil.newVar();
+ ids = DBIDUtil.newArray();
+ }
continue;
case NEXT_OBJECT:
+ if(var != null && source.assignDBID(var)) {
+ ids.add(var);
+ }
for(int i = 0; i < bundle.metaLength(); i++) {
@SuppressWarnings("unchecked")
final List<Object> col = (List<Object>) bundle.columns.get(i);
col.add(source.data(i));
}
+ ++size;
continue;
default:
LoggingUtil.warning("Unknown event: " + ev);
continue;
}
}
+ if(ids != null) {
+ if(size != ids.size()) {
+ LOG.warning("Not every object had an DBID - discarding DBIDs: " + size + " != " + ids.size());
+ }
+ else {
+ bundle.setDBIDs(ids);
+ }
+ }
return bundle;
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/ObjectBundle.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/ObjectBundle.java
index fbf88f3d..8833fb6b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/ObjectBundle.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/ObjectBundle.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,6 +24,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
*/
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
/**
* Abstract interface for object packages.
@@ -72,4 +73,13 @@ public interface ObjectBundle {
* @return Contained data
*/
public Object data(int onum, int rnum);
+
+ /**
+ * Assign the object DBID to a variable
+ *
+ * @param onum Object number
+ * @param var Variable
+ * @return {@code false} if there was no predefined DBID.
+ */
+ public boolean assignDBID(int onum, DBIDVar var);
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/SingleObjectBundle.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/SingleObjectBundle.java
index f0db03f7..8339aed4 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/SingleObjectBundle.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/SingleObjectBundle.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,6 +27,8 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
/**
* This class represents a "packaged" object, which is a transfer container for
@@ -47,6 +49,11 @@ public class SingleObjectBundle implements ObjectBundle {
private List<Object> contents;
/**
+ * Object ID
+ */
+ private DBID id;
+
+ /**
* Constructor.
*/
public SingleObjectBundle() {
@@ -66,6 +73,21 @@ public class SingleObjectBundle implements ObjectBundle {
assert (meta.size() == contents.size());
}
+ /**
+ * Constructor.
+ *
+ * @param meta Metadata
+ * @param id ID of object
+ * @param contents Object values
+ */
+ public SingleObjectBundle(BundleMeta meta, DBID id, List<Object> contents) {
+ super();
+ this.meta = meta;
+ this.id = id;
+ this.contents = contents;
+ assert (meta.size() == contents.size());
+ }
+
@Override
public BundleMeta meta() {
return meta;
@@ -104,6 +126,16 @@ public class SingleObjectBundle implements ObjectBundle {
return contents.get(rnum);
}
+ @Override
+ public boolean assignDBID(int onum, DBIDVar var) {
+ if(id == null) {
+ var.unset();
+ return false;
+ }
+ var.set(id);
+ return true;
+ }
+
/**
* Append a single representation to the object.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/StreamFromBundle.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/StreamFromBundle.java
index de683b30..7a9a68b2 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/StreamFromBundle.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/StreamFromBundle.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,8 +23,12 @@ package de.lmu.ifi.dbs.elki.datasource.bundle;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+
/**
- * Convert a MultipleObjectsBundle to a stream
+ * Convert a MultipleObjectsBundle to a stream.
+ *
+ * To use this, invoke {@link MultipleObjectsBundle#asStream()}.
*
* @author Erich Schubert
*/
@@ -60,6 +64,16 @@ public class StreamFromBundle implements BundleStreamSource {
}
@Override
+ public boolean hasDBIDs() {
+ return bundle.getDBIDs() != null;
+ }
+
+ @Override
+ public boolean assignDBID(DBIDVar var) {
+ return bundle.assignDBID(onum, var);
+ }
+
+ @Override
public Event nextEvent() {
onum += 1;
if(onum < 0) {
@@ -70,4 +84,9 @@ public class StreamFromBundle implements BundleStreamSource {
}
return Event.NEXT_OBJECT;
}
+
+ @Override
+ public MultipleObjectsBundle asMultipleObjectsBundle() {
+ return bundle;
+ }
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/bundle/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/bundle/package-info.java
index 8834e40b..6de7a929 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/bundle/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/bundle/package-info.java
@@ -7,7 +7,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
index 1cb68b30..8bedbcc3 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -58,6 +58,7 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
}
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+ final Logging logger = getLogger();
for(int r = 0; r < objects.metaLength(); r++) {
@SuppressWarnings("unchecked")
SimpleTypeInformation<Object> type = (SimpleTypeInformation<Object>) objects.meta(r);
@@ -73,18 +74,14 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
// When necessary, perform an initialization scan
if(prepareStart(castType)) {
- FiniteProgress pprog = getLogger().isVerbose() ? new FiniteProgress("Preparing normalization.", objects.dataLength(), getLogger()) : null;
+ FiniteProgress pprog = logger.isVerbose() ? new FiniteProgress("Preparing normalization.", objects.dataLength(), logger) : null;
for(Object o : column) {
@SuppressWarnings("unchecked")
final I obj = (I) o;
prepareProcessInstance(obj);
- if (pprog != null) {
- pprog.incrementProcessed(getLogger());
- }
- }
- if (pprog != null) {
- pprog.ensureCompleted(getLogger());
+ logger.incrementProcessed(pprog);
}
+ logger.ensureCompleted(pprog);
prepareComplete();
}
@@ -93,19 +90,15 @@ public abstract class AbstractConversionFilter<I, O> implements ObjectFilter {
bundle.appendColumn(convertedType(castType), castColumn);
// Normalization scan
- FiniteProgress nprog = getLogger().isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), getLogger()) : null;
+ FiniteProgress nprog = logger.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), logger) : null;
for(int i = 0; i < objects.dataLength(); i++) {
@SuppressWarnings("unchecked")
final I obj = (I) column.get(i);
final O normalizedObj = filterSingleObject(obj);
castColumn.set(i, normalizedObj);
- if (nprog != null) {
- nprog.incrementProcessed(getLogger());
- }
- }
- if (nprog != null) {
- nprog.ensureCompleted(getLogger());
+ logger.incrementProcessed(nprog);
}
+ logger.ensureCompleted(nprog);
}
return bundle;
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
index 5b48a8c0..0d4b5f8d 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
index 6a210db3..dca3d221 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractStreamFilter.java
@@ -1,14 +1,10 @@
package de.lmu.ifi.dbs.elki.datasource.filter;
-import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +22,10 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.StreamFromBundle;
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+
/**
* Abstract base class for streaming filters.
*
@@ -39,12 +39,27 @@ public abstract class AbstractStreamFilter implements StreamFilter {
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- init(new StreamFromBundle(objects));
- return MultipleObjectsBundle.fromStream(this);
+ return init(objects.asStream()).asMultipleObjectsBundle();
}
@Override
- public void init(BundleStreamSource source) {
+ public BundleStreamSource init(BundleStreamSource source) {
this.source = source;
+ return this;
+ }
+
+ @Override
+ public boolean hasDBIDs() {
+ return source.hasDBIDs();
+ }
+
+ @Override
+ public boolean assignDBID(DBIDVar var) {
+ return source.assignDBID(var);
+ }
+
+ @Override
+ public MultipleObjectsBundle asMultipleObjectsBundle() {
+ return MultipleObjectsBundle.fromStream(this);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java
index b9305aa6..c565a36c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -34,11 +34,11 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
* @param <I> Input vector type
* @param <O> Output vector type
*/
-public abstract class AbstractVectorConversionFilter<I, O extends NumberVector<?>> extends AbstractConversionFilter<I, O> {
+public abstract class AbstractVectorConversionFilter<I, O extends NumberVector> extends AbstractConversionFilter<I, O> {
/**
* Number vector factory.
*/
- protected NumberVector.Factory<O, ?> factory;
+ protected NumberVector.Factory<O> factory;
/**
* Initialize factory from a data type.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java
index 6a15c41c..b9c337bc 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/AbstractVectorStreamConversionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,11 +33,11 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
* @param <I> Input type
* @param <O> Output vector type
*/
-public abstract class AbstractVectorStreamConversionFilter<I, O extends NumberVector<?>> extends AbstractStreamConversionFilter<I, O> {
+public abstract class AbstractVectorStreamConversionFilter<I, O extends NumberVector> extends AbstractStreamConversionFilter<I, O> {
/**
* Number vector factory.
*/
- protected NumberVector.Factory<O, ?> factory;
+ protected NumberVector.Factory<O> factory;
/**
* Initialize factory from a data type.
@@ -47,5 +47,4 @@ public abstract class AbstractVectorStreamConversionFilter<I, O extends NumberVe
protected void initializeOutputType(SimpleTypeInformation<O> type) {
factory = FilterUtil.guessFactory(type);
}
-
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java
index 9ef9d34f..8873c9a1 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/FilterUtil.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,7 @@ import java.lang.reflect.Field;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
-import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
/**
@@ -51,16 +51,16 @@ public final class FilterUtil {
* @return Factory
*/
@SuppressWarnings("unchecked")
- public static <V extends NumberVector<?>> NumberVector.Factory<V, ?> guessFactory(SimpleTypeInformation<V> in) {
- NumberVector.Factory<V, ?> factory = null;
- if(in instanceof VectorFieldTypeInformation) {
- factory = (NumberVector.Factory<V, ?>) ((VectorFieldTypeInformation<V>) in).getFactory();
+ public static <V extends NumberVector> NumberVector.Factory<V> guessFactory(SimpleTypeInformation<V> in) {
+ NumberVector.Factory<V> factory = null;
+ if(in instanceof VectorTypeInformation) {
+ factory = (NumberVector.Factory<V> ) ((VectorTypeInformation<V>) in).getFactory();
}
if(factory == null) {
// FIXME: hack. Add factories to simple type information, too?
try {
Field f = in.getRestrictionClass().getField("FACTORY");
- factory = (NumberVector.Factory<V, ?>) f.get(null);
+ factory = (NumberVector.Factory<V> ) f.get(null);
}
catch(Exception e) {
LoggingUtil.warning("Cannot determine factory for type " + in.getRestrictionClass(), e);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
index ce02fc29..3c66ad85 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,12 +23,13 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDFactory;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -40,7 +41,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*
* @apiviz.has DBID oneway - - «produces»
*/
-public class FixedDBIDsFilter extends AbstractStreamFilter {
+public class FixedDBIDsFilter implements ObjectFilter {
/**
* The filtered meta
*/
@@ -62,35 +63,11 @@ public class FixedDBIDsFilter extends AbstractStreamFilter {
}
@Override
- public BundleMeta getMeta() {
- return meta;
- }
-
- @Override
- public Event nextEvent() {
- Event ev = source.nextEvent();
- if(ev == Event.META_CHANGED) {
- if(meta == null) {
- meta = new BundleMeta();
- meta.add(TypeUtil.DBID);
- }
- BundleMeta origmeta = source.getMeta();
- // Note -1 for the injected DBID column
- for(int i = meta.size() - 1; i < origmeta.size(); i++) {
- meta.add(origmeta.get(i));
- }
- }
- return ev;
- }
-
- @Override
- public Object data(int rnum) {
- if(rnum == 0) {
- DBID ret = DBIDUtil.importInteger(curid);
- curid++;
- return ret;
- }
- return source.data(rnum - 1);
+ public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+ DBIDRange ids = DBIDFactory.FACTORY.generateStaticDBIDRange(curid, objects.dataLength());
+ objects.setDBIDs(ids);
+ curid += objects.dataLength();
+ return objects;
}
/**
@@ -108,19 +85,24 @@ public class FixedDBIDsFilter extends AbstractStreamFilter {
* </p>
*/
public static final OptionID IDSTART_ID = new OptionID("dbc.startid", "Object ID to start counting with");
- int startid = -1;
+
+ /**
+ * First ID to use.
+ */
+ int startid = 0;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter startidParam = new IntParameter(IDSTART_ID);
+ IntParameter startidParam = new IntParameter(IDSTART_ID, 0) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(startidParam)) {
- startid = startidParam.getValue().intValue();
+ startid = startidParam.intValue();
}
}
@Override
- protected Object makeInstance() {
+ protected FixedDBIDsFilter makeInstance() {
return new FixedDBIDsFilter(startid);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
index ce758763..09896f15 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/NoOpFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
index 5073cea8..a6c364aa 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ObjectFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
index 798cd05d..45464f31 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/StreamFilter.java
@@ -1,12 +1,10 @@
package de.lmu.ifi.dbs.elki.datasource.filter;
-import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,9 +23,11 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+
/**
- * Streaming filters are often more efficient (less memory use) and can be used
- * in more settings.
+ * Streaming filters are often more efficient (less memory use) as they do not
+ * keep a reference to earlier data.
*
* @author Erich Schubert
*
@@ -41,5 +41,5 @@ public interface StreamFilter extends ObjectFilter, BundleStreamSource {
*
* @param source Stream source
*/
- public void init(BundleStreamSource source);
+ public BundleStreamSource init(BundleStreamSource source);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/DropNaNFilter.java
index fb9cf83e..4c226085 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/DropNaNFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -30,7 +30,9 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -43,6 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.DropNaNFilter" })
public class DropNaNFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -73,33 +76,33 @@ public class DropNaNFilter extends AbstractStreamFilter {
@Override
public Event nextEvent() {
- while (true) {
+ while(true) {
Event ev = source.nextEvent();
- switch(ev) {
+ switch(ev){
case END_OF_STREAM:
return ev;
case META_CHANGED:
updateMeta(source.getMeta());
return ev;
case NEXT_OBJECT:
- if (densecols == null) {
+ if(densecols == null) {
updateMeta(source.getMeta());
}
boolean good = true;
- for (int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
- NumberVector<?> v = (NumberVector<?>) source.data(j);
- if (v == null) {
+ for(int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
+ NumberVector v = (NumberVector) source.data(j);
+ if(v == null) {
good = false;
break;
}
- for (int i = 0; i < v.getDimensionality(); i++) {
- if (Double.isNaN(v.doubleValue(i))) {
+ for(int i = 0; i < v.getDimensionality(); i++) {
+ if(Double.isNaN(v.doubleValue(i))) {
good = false;
break;
}
}
}
- if (good) {
+ if(good) {
return ev;
}
continue;
@@ -114,21 +117,22 @@ public class DropNaNFilter extends AbstractStreamFilter {
*/
private void updateMeta(BundleMeta meta) {
int cols = meta.size();
- if (densecols == null) {
+ if(densecols == null) {
densecols = new BitSet();
- } else {
+ }
+ else {
densecols.clear();
}
- for (int i = 0; i < cols; i++) {
- if (TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+ for(int i = 0; i < cols; i++) {
+ if(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
throw new AbortException("Filtering sparse vectors is not yet supported by this filter. Please contribute.");
}
// TODO: only check for double and float?
- if (TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
densecols.set(i);
continue;
}
- if (TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
densecols.set(i);
continue;
}
@@ -137,32 +141,32 @@ public class DropNaNFilter extends AbstractStreamFilter {
@Override
public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if (LOG.isDebuggingFinest()) {
+ if(LOG.isDebuggingFinest()) {
LOG.debugFinest("Removing records with NaN values.");
}
updateMeta(objects.meta());
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
bundle.appendColumn(objects.meta(j), new ArrayList<>());
}
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final Object[] row = objects.getRow(i);
boolean good = true;
- for (int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
- NumberVector<?> v = (NumberVector<?>) row[j];
- if (v == null) {
+ for(int j = densecols.nextSetBit(0); j >= 0; j = densecols.nextSetBit(j + 1)) {
+ NumberVector v = (NumberVector) row[j];
+ if(v == null) {
good = false;
break;
}
- for (int d = 0; d < v.getDimensionality(); d++) {
- if (Double.isNaN(v.doubleValue(d))) {
+ for(int d = 0; d < v.getDimensionality(); d++) {
+ if(Double.isNaN(v.doubleValue(d))) {
good = false;
break;
}
}
}
- if (good) {
+ if(good) {
bundle.appendSimple(row);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/NoMissingValuesFilter.java
index b3f0af53..9b7ab977 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NoMissingValuesFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/NoMissingValuesFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,9 @@ import java.util.ArrayList;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
@@ -35,6 +37,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.NoMissingValuesFilter" })
public class NoMissingValuesFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -123,7 +126,7 @@ public class NoMissingValuesFilter extends AbstractStreamFilter {
*/
public static class Parameterizer extends AbstractParameterizer {
@Override
- protected Object makeInstance() {
+ protected NoMissingValuesFilter makeInstance() {
return new NoMissingValuesFilter();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/ReplaceNaNWithRandomFilter.java
index 9029d8ea..96a5f059 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/ReplaceNaNWithRandomFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -30,8 +30,10 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -47,6 +49,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ReplaceNaNWithRandomFilter" })
public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -56,7 +59,7 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
/**
* Columns to check.
*/
- private NumberVector.Factory<?, ?>[] densecols = null;
+ private NumberVector.Factory<?>[] densecols = null;
/**
* Distribution to generate replacement values with.
@@ -88,28 +91,28 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
@Override
public Event nextEvent() {
- while (true) {
+ while(true) {
Event ev = source.nextEvent();
- switch(ev) {
+ switch(ev){
case END_OF_STREAM:
return ev;
case META_CHANGED:
updateMeta(source.getMeta());
return ev;
case NEXT_OBJECT:
- if (densecols == null) {
+ if(densecols == null) {
updateMeta(source.getMeta());
}
rows.clear();
- for (int j = 0; j < densecols.length; j++) {
+ for(int j = 0; j < densecols.length; j++) {
Object o = source.data(j);
- if (densecols[j] != null) {
- NumberVector<?> v = (NumberVector<?>) o;
+ if(densecols[j] != null) {
+ NumberVector v = (NumberVector) o;
double[] ro = null; // replacement
- if (v != null) {
- for (int i = 0; i < v.getDimensionality(); i++) {
- if (Double.isNaN(v.doubleValue(i))) {
- if (ro != null) {
+ if(v != null) {
+ for(int i = 0; i < v.getDimensionality(); i++) {
+ if(Double.isNaN(v.doubleValue(i))) {
+ if(ro != null) {
ro = v.getColumnVector().getArrayRef();
}
ro[i] = dist.nextRandom();
@@ -132,19 +135,19 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
*/
private void updateMeta(BundleMeta meta) {
final int cols = meta.size();
- densecols = new NumberVector.Factory<?, ?>[cols];
- for (int i = 0; i < cols; i++) {
- if (TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+ densecols = new NumberVector.Factory<?>[cols];
+ for(int i = 0; i < cols; i++) {
+ if(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
throw new AbortException("Filtering sparse vectors is not yet supported by this filter. Please contribute.");
}
- if (TypeUtil.FLOAT_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.FLOAT_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
VectorFieldTypeInformation<?> vmeta = (VectorFieldTypeInformation<?>) meta.get(i);
- densecols[i] = (NumberVector.Factory<?, ?>) vmeta.getFactory();
+ densecols[i] = (NumberVector.Factory<?>) vmeta.getFactory();
continue;
}
- if (TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+ if(TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
VectorFieldTypeInformation<?> vmeta = (VectorFieldTypeInformation<?>) meta.get(i);
- densecols[i] = (NumberVector.Factory<?, ?>) vmeta.getFactory();
+ densecols[i] = (NumberVector.Factory<?>) vmeta.getFactory();
continue;
}
}
@@ -152,25 +155,25 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
@Override
public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if (LOG.isDebuggingFinest()) {
+ if(LOG.isDebuggingFinest()) {
LOG.debugFinest("Removing records with NaN values.");
}
updateMeta(objects.meta());
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
bundle.appendColumn(objects.meta(j), new ArrayList<>());
}
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final Object[] row = objects.getRow(i);
- for (int j = 0; j < densecols.length; j++) {
- if (densecols[j] != null) {
- NumberVector<?> v = (NumberVector<?>) row[j];
+ for(int j = 0; j < densecols.length; j++) {
+ if(densecols[j] != null) {
+ NumberVector v = (NumberVector) row[j];
double[] ro = null; // replacement
- if (v != null) {
- for (int d = 0; d < v.getDimensionality(); d++) {
- if (Double.isNaN(v.doubleValue(d))) {
- if (ro != null) {
+ if(v != null) {
+ for(int d = 0; d < v.getDimensionality(); d++) {
+ if(Double.isNaN(v.doubleValue(d))) {
+ if(ro != null) {
ro = v.getColumnVector().getArrayRef();
}
ro[d] = dist.nextRandom();
@@ -207,7 +210,7 @@ public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<Distribution> distP = new ObjectParameter<>(REPLACEMENT_DISTRIBUTION, Distribution.class);
- if (config.grab(distP)) {
+ if(config.grab(distP)) {
dist = distP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java
new file mode 100644
index 00000000..5d4e2d2a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/VectorDimensionalityFilter.java
@@ -0,0 +1,219 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * Stream filter to remove all vectors that do not have the desired
+ * dimensionality.
+ *
+ * The first column of the source that is assignable to
+ * {@link TypeUtil#NUMBER_VECTOR_VARIABLE_LENGTH} is chosen as the column to
+ * filter. Objects whose vector in this column is {@code null} or has a
+ * different dimensionality are skipped, and the column is advertised as a
+ * fixed-dimensionality vector field in the filtered metadata.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.VectorDimensionalityFilter" })
+public class VectorDimensionalityFilter<V extends NumberVector> extends AbstractStreamFilter {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(VectorDimensionalityFilter.class);
+
+  /**
+   * The filtered meta; {@code null} while it needs to be (re)computed from the
+   * source meta.
+   */
+  BundleMeta meta;
+
+  /**
+   * The column to filter, or -1 if no column has been chosen yet.
+   */
+  int column = -1;
+
+  /**
+   * Desired dimensionality, or -1 if it is not known yet.
+   */
+  int dim = -1;
+
+  /**
+   * Constructor.
+   *
+   * @param dim Dimensionality to enforce (use -1 to use the dimensionality of
+   *        the first vector in the data set)
+   */
+  public VectorDimensionalityFilter(int dim) {
+    super();
+    this.dim = dim;
+  }
+
+  /**
+   * Get the metadata of the filtered stream.
+   *
+   * @return the filtered metadata, lazily derived from the source metadata
+   */
+  @Override
+  public BundleMeta getMeta() {
+    if(meta == null) {
+      updateMeta();
+    }
+    // Return the filtered meta, not the source meta: otherwise the type
+    // information assembled by updateMeta() would never reach the consumer.
+    return meta;
+  }
+
+  /**
+   * Get the data of the current object; the values are passed through
+   * unmodified from the source.
+   *
+   * @param rnum Column number
+   * @return the source's value for this column
+   */
+  @Override
+  public Object data(int rnum) {
+    return source.data(rnum);
+  }
+
+  /**
+   * Advance the stream to the next event, silently consuming objects that do
+   * not pass the dimensionality filter.
+   *
+   * @return the next event of the source that is not a filtered-out object
+   */
+  @Override
+  public Event nextEvent() {
+    while(true) {
+      Event ev = source.nextEvent();
+      switch(ev){
+      case END_OF_STREAM:
+        return ev;
+      case META_CHANGED:
+        // Invalidate; the filtered meta is rebuilt lazily on next use.
+        meta = null;
+        return ev;
+      case NEXT_OBJECT:
+        if(meta == null) {
+          updateMeta();
+        }
+        // Filtering is only active once both a column and a dimensionality
+        // are known; otherwise every object is passed through.
+        if(column >= 0 && dim >= 0) {
+          @SuppressWarnings("unchecked")
+          V vec = (V) source.data(column);
+          if(vec == null) {
+            if(LOG.isVeryVerbose()) {
+              LOG.veryverbose("Skipping null vector.");
+            }
+            continue;
+          }
+          if(vec.getDimensionality() != dim) {
+            if(LOG.isVeryVerbose()) {
+              // Log the complete record (all columns) that is being dropped.
+              StringBuilder buf = new StringBuilder();
+              buf.append("Skipping vector of wrong dimensionality ");
+              buf.append(vec.getDimensionality());
+              buf.append(':');
+              for(int i = 0; i < meta.size(); i++) {
+                buf.append(' ');
+                buf.append(source.data(i));
+              }
+              LOG.veryverbose(buf.toString());
+            }
+            continue;
+          }
+        }
+        return ev;
+      }
+      // Any event not handled above falls out of the switch and the source
+      // is polled again.
+    }
+  }
+
+  /**
+   * Rebuild the filtered metadata from the source metadata.
+   *
+   * As long as no column has been chosen yet, the first column assignable to
+   * {@link TypeUtil#NUMBER_VECTOR_VARIABLE_LENGTH} becomes the filter column
+   * and, if the dimensionality is known, its type is replaced by a vector field
+   * type of exactly this dimensionality. All other columns are copied as is.
+   *
+   * @throws AbortException when the type declares dimensionality bounds that
+   *         exclude the desired dimensionality, i.e. every vector would be
+   *         filtered
+   */
+  private void updateMeta() {
+    meta = new BundleMeta();
+    BundleMeta origmeta = source.getMeta();
+    for(int i = 0; i < origmeta.size(); i++) {
+      SimpleTypeInformation<?> type = origmeta.get(i);
+      // NOTE(review): once a column was chosen, later calls (after a
+      // META_CHANGED event) copy all types unchanged - confirm this is intended.
+      if(column < 0) {
+        // Test whether this type matches
+        if(TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(type)) {
+          if(type instanceof VectorFieldTypeInformation) {
+            @SuppressWarnings("unchecked")
+            final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+            if(dim != -1 && castType.mindim() > dim) {
+              throw new AbortException("Would filter all vectors: minimum dimensionality " + castType.mindim() + " > desired dimensionality " + dim);
+            }
+            if(dim != -1 && castType.maxdim() < dim) {
+              throw new AbortException("Would filter all vectors: maximum dimensionality " + castType.maxdim() + " < desired dimensionality " + dim);
+            }
+            // No dimensionality requested: adopt the declared minimum.
+            if(dim == -1) {
+              dim = castType.mindim();
+            }
+            // Already a fixed-dimensionality field: keep the type as is.
+            if(castType.mindim() == castType.maxdim()) {
+              meta.add(castType);
+              column = i;
+              continue;
+            }
+          }
+          @SuppressWarnings("unchecked")
+          final VectorTypeInformation<V> castType = (VectorTypeInformation<V>) type;
+          if(dim != -1) {
+            // Advertise the column with exactly the enforced dimensionality.
+            meta.add(new VectorFieldTypeInformation<>(FilterUtil.guessFactory(castType), dim, dim, castType.getSerializer()));
+          }
+          else {
+            LOG.warning("No dimensionality yet for column " + i);
+            meta.add(castType);
+          }
+          column = i;
+          continue;
+        }
+      }
+      meta.add(type);
+    }
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   *
+   * @param <V> Vector type
+   */
+  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+    /**
+     * Parameter for specifying the dimensionality.
+     */
+    private static final OptionID DIM_P = new OptionID("filter.dim", "Dimensionality of vectors to retain.");
+
+    /**
+     * Desired dimensionality; -1 when the optional parameter was not given.
+     */
+    int dim = -1;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      // Optional parameter; when given it must be at least 1.
+      IntParameter dimP = new IntParameter(DIM_P)//
+      .setOptional(true)//
+      .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+      dim = config.grab(dimP) ? dimP.intValue() : -1;
+    }
+
+    @Override
+    protected VectorDimensionalityFilter<V> makeInstance() {
+      return new VectorDimensionalityFilter<>(dim);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java
new file mode 100644
index 00000000..b2d47e69
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/cleaning/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Filters for data cleaning.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.cleaning; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
index 0b4d7ae0..8ad13355 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractNormalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,9 +33,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
*
* @author Elke Achtert
*
- * @param <O> Object type processed
+ * @param <V> Object type processed
*/
-public abstract class AbstractNormalization<O extends NumberVector<?>> extends AbstractVectorConversionFilter<O, O> implements Normalization<O> {
+public abstract class AbstractNormalization<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> implements Normalization<V> {
/**
* Initializes the option handler and the parameter map.
*/
@@ -44,12 +44,18 @@ public abstract class AbstractNormalization<O extends NumberVector<?>> extends A
}
@Override
- protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<O> in) {
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
initializeOutputType(in);
return in;
}
@Override
+ public V restore(V featureVector) throws NonNumericFeaturesException {
+ // FIXME: implement everywhere.
+ throw new UnsupportedOperationException("Not implemented yet.");
+ }
+
+ @Override
public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
// FIXME: implement.
throw new UnsupportedOperationException("Not yet implemented!");
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
index 54fc7794..38e0bf31 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AbstractStreamNormalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,9 +33,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
*
* @author Erich Schubert
*
- * @param <O> Object type processed
+ * @param <V> Object type processed
*/
-public abstract class AbstractStreamNormalization<O extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<O, O> implements Normalization<O> {
+public abstract class AbstractStreamNormalization<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> implements Normalization<V> {
/**
* Initializes the option handler and the parameter map.
*/
@@ -44,12 +44,17 @@ public abstract class AbstractStreamNormalization<O extends NumberVector<?>> ext
}
@Override
- protected SimpleTypeInformation<? super O> convertedType(SimpleTypeInformation<O> in) {
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
initializeOutputType(in);
return in;
}
@Override
+ public V restore(V featureVector) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
// FIXME: implement.
throw new UnsupportedOperationException("Not yet implemented!");
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
index 0abaac95..d9002c93 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/NonNumericFeaturesException.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
index 3c3e7bdf..bf913852 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/Normalization.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
/**
* Normalization performs a normalization on a set of feature vectors and is
@@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
*
* @param <O> object type
*/
-public interface Normalization<O> extends ObjectFilter, Parameterizable {
+public interface Normalization<O> extends ObjectFilter {
/**
* Transforms a feature vector to the original attribute ranges.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
deleted file mode 100644
index 09b73aa4..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import gnu.trove.map.hash.TIntDoubleHashMap;
-import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-
-/**
- * Perform full TF-IDF Normalization as commonly used in text mining.
- *
- * Each record is first normalized using "term frequencies" to sum up to 1. Then
- * it is globally normalized using the Inverse Document Frequency, so rare terms
- * are weighted stronger than common terms.
- *
- * Restore will only undo the IDF part of the normalization!
- *
- * @author Erich Schubert
- *
- * @param <V> Vector type
- */
-public class TFIDFNormalization<V extends SparseNumberVector<?>> extends InverseDocumentFrequencyNormalization<V> {
- /**
- * Class logger.
- */
- private static final Logging LOG = Logging.getLogger(TFIDFNormalization.class);
-
- /**
- * Constructor.
- */
- public TFIDFNormalization() {
- super();
- }
-
- @Override
- protected V filterSingleObject(V featureVector) {
- double sum = 0.0;
- for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) {
- sum += featureVector.iterDoubleValue(it);
- }
- if(sum <= 0) {
- sum = 1.0;
- }
- TIntDoubleHashMap vals = new TIntDoubleHashMap();
- for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) {
- final int dim = featureVector.iterDim(it);
- vals.put(dim, featureVector.iterDoubleValue(it) / sum * idf.get(dim));
- }
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java
new file mode 100644
index 00000000..a1618b9f
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseBetaNormalization.java
@@ -0,0 +1,326 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta.BestFitEstimator;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParameter;
+
+/**
+ * Project the data using a Beta distribution.
+ *
+ * For every attribute, a distribution is fitted to the data; each value is then
+ * mapped through the cumulative distribution function of the fitted
+ * distribution and through the quantile function of a symmetric Beta
+ * distribution whose shape is derived from {@code alpha} and the data
+ * dimensionality.
+ *
+ * This is a crude heuristic, that may or may not work for your data set. There
+ * currently is no theoretical foundation of why it may be sensible or not to do
+ * this.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ * @apiviz.uses DistributionEstimator
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseBetaNormalization"})
+public class AttributeWiseBetaNormalization<V extends NumberVector> implements Normalization<V> {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(AttributeWiseBetaNormalization.class);
+
+  /**
+   * Stores the distribution estimators
+   */
+  private List<DistributionEstimator<?>> estimators;
+
+  /**
+   * Stores the estimated distributions (one per dimension of the column
+   * processed last)
+   */
+  private List<Distribution> dists;
+
+  /**
+   * Number vector factory.
+   */
+  protected NumberVector.Factory<V> factory;
+
+  /**
+   * Expected outlier rate alpha.
+   */
+  protected double alpha = 0.01;
+
+  /**
+   * Constructor.
+   *
+   * @param estimators Distribution estimators
+   * @param alpha Alpha parameter controlling the shape of the output
+   *        distribution
+   */
+  public AttributeWiseBetaNormalization(List<DistributionEstimator<?>> estimators, double alpha) {
+    super();
+    this.estimators = estimators;
+    this.alpha = alpha;
+  }
+
+  /**
+   * Normalize all number vector field columns of the bundle in place.
+   *
+   * @param objects Bundle to process
+   * @return the same bundle, with the vectors of every number vector field
+   *         column replaced by their normalized counterparts
+   * @throws IllegalStateException when several estimators are compared and
+   *         none of them produces a usable fit for some dimension
+   */
+  @Override
+  public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
+    if(objects.dataLength() == 0) {
+      return objects;
+    }
+    for(int r = 0; r < objects.metaLength(); r++) {
+      SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
+      final List<?> column = (List<?>) objects.getColumn(r);
+      if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+        continue;
+      }
+      @SuppressWarnings("unchecked")
+      final List<V> castColumn = (List<V>) column;
+      // Get the replacement type information
+      @SuppressWarnings("unchecked")
+      final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
+      factory = FilterUtil.guessFactory(castType);
+
+      // Scan to find the best
+      final int dim = castType.getDimensionality();
+      dists = new ArrayList<>(dim);
+      // Scratch space for testing; only needed when estimators are compared.
+      final double[] test = estimators.size() > 1 ? new double[castColumn.size()] : null;
+
+      // We iterate over dimensions, this kind of filter needs fast random
+      // access.
+      Adapter adapter = new Adapter();
+      for(int d = 0; d < dim; d++) {
+        adapter.dim = d;
+        if(estimators.size() == 1) {
+          dists.add(estimators.get(0).estimate(castColumn, adapter));
+        }
+        else {
+          dists.add(findBestFit(castColumn, adapter, d, test));
+        }
+      }
+
+      // Beta distribution for projection
+      double p = Math.pow(alpha, -1 / Math.sqrt(dim));
+      BetaDistribution beta = new BetaDistribution(p, p);
+      // Normalization scan
+      double[] buf = new double[dim];
+      for(int i = 0; i < objects.dataLength(); i++) {
+        final V obj = castColumn.get(i);
+        for(int d = 0; d < dim; d++) {
+          // TODO: when available, use logspace for better numerical precision!
+          buf[d] = beta.quantile(dists.get(d).cdf(obj.doubleValue(d)));
+        }
+        castColumn.set(i, factory.newNumberVector(buf));
+      }
+    }
+    return objects;
+  }
+
+  /**
+   * Fit all estimators to one dimension and choose the distribution with the
+   * smallest deviation according to {@link KolmogorovSmirnovTest#simpleTest}.
+   *
+   * Estimators that throw an {@link ArithmeticException}, or whose fitted
+   * distribution yields NaN or infinite CDF values, are not considered.
+   *
+   * @param col Column data
+   * @param adapter Array adapter, already set to dimension {@code d}
+   * @param d Dimension to process
+   * @param test Scratch buffer with one entry per object in the column
+   * @return best fitting distribution
+   * @throws IllegalStateException when no estimator produced a usable fit
+   */
+  private Distribution findBestFit(final List<V> col, final Adapter adapter, final int d, final double[] test) {
+    Distribution best = null;
+    double bestq = Double.POSITIVE_INFINITY;
+    trials: for(DistributionEstimator<?> est : estimators) {
+      try {
+        Distribution dist = est.estimate(col, adapter);
+        for(int i = 0; i < test.length; i++) {
+          test[i] = dist.cdf(col.get(i).doubleValue(d));
+          if(Double.isNaN(test[i])) {
+            LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
+            continue trials;
+          }
+          if(Double.isInfinite(test[i])) {
+            LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
+            continue trials;
+          }
+        }
+        Arrays.sort(test);
+        double q = KolmogorovSmirnovTest.simpleTest(test);
+        if(LOG.isVeryVerbose()) {
+          LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
+        }
+        if(best == null || q < bestq) {
+          best = dist;
+          bestq = q;
+        }
+      }
+      catch(ArithmeticException e) {
+        // This estimator is not applicable to the data; try the next one.
+        if(LOG.isVeryVerbose()) {
+          LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
+        }
+        continue;
+      }
+    }
+    // Fail with a clear message instead of a NullPointerException later on.
+    if(best == null) {
+      throw new IllegalStateException("No distribution estimator produced a usable fit for dimension " + d);
+    }
+    if(LOG.isVerbose()) {
+      LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
+    }
+    return best;
+  }
+
+  /**
+   * Not supported by this normalization.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public V restore(V featureVector) throws NonNumericFeaturesException {
+    throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+  }
+
+  /**
+   * Not supported by this normalization.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+    throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
+  }
+
+  /**
+   * Describe this normalization: the class name, followed by the simple class
+   * names of the configured estimators.
+   */
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder();
+    result.append("normalization class: ").append(getClass().getName());
+    result.append('\n');
+    result.append("normalization distributions: ");
+    boolean first = true;
+    for(DistributionEstimator<?> est : estimators) {
+      if(!first) {
+        result.append(',');
+      }
+      first = false;
+      result.append(est.getClass().getSimpleName());
+    }
+    return result.toString();
+  }
+
+  /**
+   * Array adapter class for vectors: exposes a single dimension of a list of
+   * vectors as a number array.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector>> {
+    /**
+     * Dimension to process.
+     */
+    int dim;
+
+    @Override
+    public int size(List<? extends NumberVector> array) {
+      return array.size();
+    }
+
+    @Override
+    public Double get(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return getDouble(array, off);
+    }
+
+    @Override
+    public double getDouble(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return array.get(off).doubleValue(dim);
+    }
+
+    @Override
+    public float getFloat(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return array.get(off).floatValue(dim);
+    }
+
+    @Override
+    public int getInteger(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return array.get(off).intValue(dim);
+    }
+
+    @Override
+    public short getShort(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return array.get(off).shortValue(dim);
+    }
+
+    @Override
+    public long getLong(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return array.get(off).longValue(dim);
+    }
+
+    @Override
+    public byte getByte(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
+      return array.get(off).byteValue(dim);
+    }
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   *
+   * @param <V> vector type
+   */
+  public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+    /**
+     * Parameter for distribution estimators.
+     */
+    public static final OptionID DISTRIBUTIONS_ID = new OptionID("betanormalize.distributions", "A list of the distribution estimators to try.");
+
+    /**
+     * Shape parameter.
+     */
+    public static final OptionID ALPHA_ID = new OptionID("betanormalize.alpha", "Alpha parameter to control the shape of the output distribution.");
+
+    /**
+     * Stores the distribution estimators
+     */
+    private List<DistributionEstimator<?>> estimators;
+
+    /**
+     * Expected outlier rate alpha.
+     */
+    private double alpha;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      ObjectListParameter<DistributionEstimator<?>> estP = new ObjectListParameter<>(DISTRIBUTIONS_ID, DistributionEstimator.class);
+      // Default: a single BestFitEstimator.
+      List<Class<? extends DistributionEstimator<?>>> def = new ArrayList<>(1);
+      def.add(BestFitEstimator.class);
+      estP.setDefaultValue(def);
+      if(config.grab(estP)) {
+        estimators = estP.instantiateClasses(config);
+      }
+
+      // NOTE(review): the parameter default (0.1) differs from the default of
+      // the alpha field in the outer class (0.01) - confirm which is intended.
+      DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.1);
+      if(config.grab(alphaP)) {
+        alpha = alphaP.doubleValue();
+      }
+    }
+
+    @Override
+    protected AttributeWiseBetaNormalization<V> makeInstance() {
+      return new AttributeWiseBetaNormalization<>(estimators, alpha);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java
index dd86cc5a..be501b11 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseCDFNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,12 +33,16 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.meta.BestFitEstimator;
import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -66,7 +70,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectListParamet
* @apiviz.uses DistributionEstimator
*/
// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements Normalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseCDFNormalization"})
+public class AttributeWiseCDFNormalization<V extends NumberVector> implements Normalization<V> {
/**
* Class logger.
*/
@@ -85,7 +90,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
/**
* Number vector factory.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
/**
* Constructor.
@@ -99,13 +104,13 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (objects.dataLength() == 0) {
+ if(objects.dataLength() == 0) {
return objects;
}
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
@SuppressWarnings("unchecked")
@@ -119,60 +124,33 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
final int dim = castType.getDimensionality();
dists = new ArrayList<>(dim);
// Scratch space for testing:
- double[] test = new double[castColumn.size()];
+ double[] test = estimators.size() > 1 ? new double[castColumn.size()] : null;
// We iterate over dimensions, this kind of filter needs fast random
// access.
Adapter adapter = new Adapter();
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
adapter.dim = d;
- if (estimators.size() == 1) {
- dists.add(estimators.get(0).estimate(castColumn, adapter));
- } else {
- Distribution best = null;
- double bestq = Double.POSITIVE_INFINITY;
- trials: for (DistributionEstimator<?> est : estimators) {
- try {
- Distribution dist = est.estimate(castColumn, adapter);
- for (int i = 0; i < test.length; i++) {
- test[i] = dist.cdf(castColumn.get(i).doubleValue(d));
- if (Double.isNaN(test[i])) {
- LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
- continue trials;
- }
- if (Double.isInfinite(test[i])) {
- LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
- continue trials;
- }
- }
- Arrays.sort(test);
- double q = KolmogorovSmirnovTest.simpleTest(test);
- if (LOG.isVeryVerbose()) {
- LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
- }
- if (best == null || q < bestq) {
- best = dist;
- bestq = q;
- }
- } catch (ArithmeticException e) {
- if (LOG.isVeryVerbose()) {
- LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
- }
- continue;
- }
- }
- if (LOG.isVerbose()) {
- LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
- }
- dists.add(best);
+ Distribution dist;
+ if(estimators.size() == 1) {
+ dist = estimators.get(0).estimate(castColumn, adapter);
+ }
+ else {
+ dist = findBestFit(castColumn, adapter, d, test);
+ }
+ // Special handling for constant distributions:
+ // We want them to remain 0, instead of - usually - becoming constant .5
+ if(dist instanceof UniformDistribution) {
+ dist = constantZero(castColumn, adapter) ? new UniformDistribution(0., 1.) : dist;
}
+ dists.add(dist);
}
// Normalization scan
double[] buf = new double[dim];
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
buf[d] = dists.get(d).cdf(obj.doubleValue(d));
}
castColumn.set(i, factory.newNumberVector(buf));
@@ -181,6 +159,71 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
return objects;
}
+  /**
+   * Find the best fitting distribution for a single dimension.
+   *
+   * Each configured estimator is fitted to the column. The fitted
+   * distribution's CDF is evaluated for every object, the values are sorted,
+   * and {@code KolmogorovSmirnovTest.simpleTest} is used as quality score; the
+   * distribution with the smallest score wins. Estimators that throw an
+   * {@link ArithmeticException}, or whose distribution yields a NaN or infinite
+   * CDF value, are skipped.
+   *
+   * @param col Column of table
+   * @param adapter Adapter for accessing the data (presumably already set to
+   *        dimension {@code d} by the caller)
+   * @param d Dimension
+   * @param test Scratch space for testing goodness of fit, one entry per
+   *        object of {@code col}
+   * @return Best fit distribution, never {@code null}
+   * @throws IllegalStateException if no estimator produced a usable
+   *         distribution for this dimension
+   */
+  protected Distribution findBestFit(final List<V> col, Adapter adapter, int d, double[] test) {
+    Distribution best = null;
+    double bestq = Double.POSITIVE_INFINITY;
+    trials: for(DistributionEstimator<?> est : estimators) {
+      try {
+        Distribution dist = est.estimate(col, adapter);
+        for(int i = 0; i < test.length; i++) {
+          test[i] = dist.cdf(col.get(i).doubleValue(d));
+          if(Double.isNaN(test[i])) {
+            LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
+            continue trials;
+          }
+          if(Double.isInfinite(test[i])) {
+            LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
+            continue trials;
+          }
+        }
+        // The goodness-of-fit test is applied to the sorted CDF values.
+        Arrays.sort(test);
+        double q = KolmogorovSmirnovTest.simpleTest(test);
+        if(LOG.isVeryVerbose()) {
+          LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
+        }
+        if(best == null || q < bestq) {
+          best = dist;
+          bestq = q;
+        }
+      }
+      catch(ArithmeticException e) {
+        // This estimator cannot handle the data; try the next one.
+        if(LOG.isVeryVerbose()) {
+          LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
+        }
+      }
+    }
+    // Fail with a clear message instead of a NullPointerException (here when
+    // logging, or later when the caller evaluates the CDF).
+    if(best == null) {
+      throw new IllegalStateException("No distribution estimator produced a usable fit for dimension " + d + ".");
+    }
+    if(LOG.isVerbose()) {
+      LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
+    }
+    return best;
+  }
+
+  /**
+   * Test if an attribute is constant zero.
+   *
+   * Scans the column through the adapter and stops at the first value that is
+   * not equal to zero. An empty column yields {@code true}.
+   *
+   * @param column Column
+   * @param adapter Data accessor.
+   * @return {@code true} if all values are zero
+   */
+  protected boolean constantZero(List<V> column, Adapter adapter) {
+    final int size = adapter.size(column);
+    for(int i = 0; i < size; i++) {
+      // Primitive accessor: avoids boxing every value into a Double.
+      if(adapter.getDouble(column, i) != 0.) {
+        return false;
+      }
+    }
+    return true;
+  }
+
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
throw new UnsupportedOperationException(ExceptionMessages.UNSUPPORTED_NOT_YET);
@@ -198,8 +241,8 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
result.append('\n');
result.append("normalization distributions: ");
boolean first = true;
- for (DistributionEstimator<?> est : estimators) {
- if (!first) {
+ for(DistributionEstimator<?> est : estimators) {
+ if(!first) {
result.append(',');
}
first = false;
@@ -212,52 +255,52 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
* Array adapter class for vectors.
*
* @author Erich Schubert
- *
+ *
* @apiviz.exclude
*/
- private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector<?>>> {
+ private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector>> {
/**
* Dimension to process.
*/
int dim;
@Override
- public int size(List<? extends NumberVector<?>> array) {
+ public int size(List<? extends NumberVector> array) {
return array.size();
}
@Override
- public Double get(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public Double get(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return getDouble(array, off);
}
@Override
- public double getDouble(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public double getDouble(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).doubleValue(dim);
}
@Override
- public float getFloat(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public float getFloat(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).floatValue(dim);
}
@Override
- public int getInteger(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public int getInteger(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).intValue(dim);
}
@Override
- public short getShort(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public short getShort(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).shortValue(dim);
}
@Override
- public long getLong(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public long getLong(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).longValue(dim);
}
@Override
- public byte getByte(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException {
+ public byte getByte(List<? extends NumberVector> array, int off) throws IndexOutOfBoundsException {
return array.get(off).byteValue(dim);
}
}
@@ -269,7 +312,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter for distribution estimators.
*/
@@ -287,7 +330,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements
List<Class<? extends DistributionEstimator<?>>> def = new ArrayList<>(1);
def.add(BestFitEstimator.class);
estP.setDefaultValue(def);
- if (config.grab(estP)) {
+ if(config.grab(estP)) {
estimators = estP.instantiateClasses(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java
index 9a263171..e4af3a92 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseErfNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseErfNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
* Attribute-wise Normalization using the error function. This mostly makes
@@ -35,11 +37,12 @@ import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
*
* @author Erich Schubert
*
- * @param <O> Object type
+ * @param <V> Object type
*
* @apiviz.uses NumberVector
*/
-public class AttributeWiseErfNormalization<O extends NumberVector<?>> extends AbstractNormalization<O> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseErfNormalization"})
+public class AttributeWiseErfNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
@@ -53,26 +56,21 @@ public class AttributeWiseErfNormalization<O extends NumberVector<?>> extends Ab
}
@Override
- public O restore(O featureVector) {
- throw new UnsupportedOperationException("Not implemented yet.");
- }
-
- @Override
- protected O filterSingleObject(O obj) {
+ protected V filterSingleObject(V obj) {
double[] val = new double[obj.getDimensionality()];
- for (int i = 0; i < val.length; i++) {
+ for(int i = 0; i < val.length; i++) {
val[i] = NormalDistribution.erf(obj.doubleValue(i));
}
return factory.newNumberVector(val);
}
@Override
- protected SimpleTypeInformation<? super O> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ protected Logging getLogger() {
+ return LOG;
}
@Override
- protected Logging getLogger() {
- return LOG;
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
index 8c4f15e1..ec50aadd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMADNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMADNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,10 +31,13 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.Normalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
@@ -54,7 +57,8 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
* @apiviz.uses NumberVector
*/
// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements Normalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMADNormalization"})
+public class AttributeWiseMADNormalization<V extends NumberVector> implements Normalization<V> {
/**
* Class logger.
*/
@@ -63,7 +67,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
/**
* Number vector factory.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
/**
* Stores the median in each dimension.
@@ -71,9 +75,9 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
private double[] median = new double[0];
/**
- * Stores the median absolute deviation in each dimension.
+ * Stores the inverse median absolute deviation in each dimension.
*/
- private double[] madsigma = new double[0];
+ private double[] imadsigma = new double[0];
/**
* Constructor.
@@ -84,13 +88,13 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (objects.dataLength() == 0) {
+ if(objects.dataLength() == 0) {
return objects;
}
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
@SuppressWarnings("unchecked")
@@ -103,61 +107,72 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
// Scan to find the best
final int dim = castType.getDimensionality();
median = new double[dim];
- madsigma = new double[dim];
+ imadsigma = new double[dim];
// Scratch space for testing:
double[] test = new double[castColumn.size()];
FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Analyzing data.", dim, LOG) : null;
// We iterate over dimensions, this kind of filter needs fast random
// access.
- for (int d = 0; d < dim; d++) {
- for (int i = 0; i < test.length; i++) {
+ for(int d = 0; d < dim; d++) {
+ for(int i = 0; i < test.length; i++) {
test[i] = castColumn.get(i).doubleValue(d);
}
final double med = QuickSelect.median(test);
median[d] = med;
- for (int i = 0; i < test.length; i++) {
+ int zeros = 0;
+ for(int i = 0; i < test.length; i++) {
test[i] = Math.abs(test[i] - med);
+ if(test[i] == 0.) {
+ zeros++;
+ }
}
// Rescale the true MAD for the best standard deviation estimate:
- madsigma[d] = QuickSelect.median(test) * NormalDistribution.ONEBYPHIINV075;
- if (dprog != null) {
- dprog.incrementProcessed(LOG);
+ if(zeros < (test.length >>> 1)) {
+ imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(test);
}
+ else if(zeros == test.length) {
+ LOG.warning("Constant attribute detected. Using MAD=1.");
+ imadsigma[d] = 1.; // Does not matter. Constant distribution.
+ }
+ else {
+ // We have more than 50% zeros, so the regular MAD estimate does not
+ // work. Generalize the MAD approach to use the 50% non-zero value:
+ final int rank = zeros + ((test.length - zeros) >> 1);
+ final double rel = .5 + rank * .5 / test.length;
+ imadsigma[d] = NormalDistribution.quantile(0., 1., rel) / QuickSelect.quickSelect(test, rank);
+ LOG.warning("Near-constant attribute detected. Using modified MAD.");
+ }
+ LOG.incrementProcessed(dprog);
}
- if (dprog != null) {
- dprog.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(dprog);
FiniteProgress nprog = LOG.isVerbose() ? new FiniteProgress("Data normalization.", objects.dataLength(), LOG) : null;
// Normalization scan
double[] buf = new double[dim];
- for (int i = 0; i < objects.dataLength(); i++) {
+ for(int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
buf[d] = normalize(d, obj.doubleValue(d));
}
castColumn.set(i, factory.newNumberVector(buf));
- if (nprog != null) {
- nprog.incrementProcessed(LOG);
- }
- }
- if (nprog != null) {
- nprog.ensureCompleted(LOG);
+ LOG.incrementProcessed(nprog);
}
+ LOG.ensureCompleted(nprog);
}
return objects;
}
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if (featureVector.getDimensionality() == median.length) {
+ if(featureVector.getDimensionality() == median.length) {
double[] values = new double[featureVector.getDimensionality()];
- for (int d = 0; d < featureVector.getDimensionality(); d++) {
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
values[d] = restore(d, featureVector.doubleValue(d));
}
return factory.newNumberVector(values);
- } else {
+ }
+ else {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + median.length);
}
}
@@ -175,7 +190,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
* @return Normalized value
*/
private double normalize(int d, double val) {
- return (val - median[d]) / madsigma[d];
+ return (val - median[d]) * imadsigma[d];
}
/**
@@ -186,7 +201,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
* @return Normalized value
*/
private double restore(int d, double val) {
- return (val * madsigma[d]) + median[d];
+ return (val / imadsigma[d]) + median[d];
}
@Override
@@ -196,7 +211,7 @@ public class AttributeWiseMADNormalization<V extends NumberVector<?>> implements
result.append('\n');
result.append("normalization median: ").append(FormatUtil.format(median));
result.append('\n');
- result.append("normalization MAD sigma: ").append(FormatUtil.format(madsigma));
+ result.append("normalization scaling factor: ").append(FormatUtil.format(imadsigma));
return result.toString();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java
new file mode 100644
index 00000000..1039ab5b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMeanNormalization.java
@@ -0,0 +1,207 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+
+/**
+ * Normalization designed for data with a <em>meaningful zero</em>: Each
+ * attribute is scaled to have the same mean (but 0 is not changed).
+ *
+ * Every value is divided by the mean of its attribute. The means are either
+ * given to the constructor, or computed from the data in the preparation
+ * callbacks.
+ *
+ * @author Erich Schubert
+ * @param <V> vector type
+ *
+ * @apiviz.uses NumberVector
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMeanNormalization"})
+public class AttributeWiseMeanNormalization<V extends NumberVector> extends AbstractNormalization<V> {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(AttributeWiseMeanNormalization.class);
+
+  /**
+   * Stores the mean in each dimension.
+   */
+  private double[] mean = null;
+
+  /**
+   * Temporary storage used during initialization: per-dimension sums.
+   */
+  double[] sums = null;
+
+  /**
+   * Count the number of values seen.
+   */
+  int c = 0;
+
+  /**
+   * Constructor.
+   *
+   * @param mean Mean value
+   */
+  public AttributeWiseMeanNormalization(double[] mean) {
+    super();
+    this.mean = mean;
+  }
+
+  /**
+   * Constructor.
+   */
+  public AttributeWiseMeanNormalization() {
+    super();
+  }
+
+  @Override
+  protected boolean prepareStart(SimpleTypeInformation<V> in) {
+    // True exactly when no means are available yet (none were given to the
+    // constructor); presumably this requests the preparation scan.
+    return (mean == null || mean.length == 0);
+  }
+
+  @Override
+  protected void prepareProcessInstance(V featureVector) {
+    // First object? Then init. (We didn't have a dimensionality before!)
+    if(sums == null || sums.length == 0) {
+      sums = new double[featureVector.getDimensionality()];
+    }
+    final int dim = featureVector.getDimensionality();
+    for(int d = 0; d < dim; d++) {
+      sums[d] += featureVector.doubleValue(d);
+    }
+    ++c;
+  }
+
+  @Override
+  protected void prepareComplete() {
+    // NOTE(review): the message is built when verbose logging is enabled, but
+    // emitted at debugFine level - confirm which level is intended.
+    StringBuilder buf = LOG.isVerbose() ? new StringBuilder() : null;
+    final int dimensionality = sums.length;
+    mean = new double[dimensionality];
+    if(buf != null) {
+      buf.append("Normalization parameters: ");
+    }
+    for(int d = 0; d < dimensionality; d++) {
+      mean[d] = sums[d] / c;
+      if(buf != null) {
+        buf.append(" m: ").append(mean[d]);
+      }
+    }
+    // The sums are no longer needed once the means are known.
+    sums = null;
+    if(buf != null) {
+      LOG.debugFine(buf.toString());
+    }
+  }
+
+  @Override
+  protected V filterSingleObject(V featureVector) {
+    final int dim = featureVector.getDimensionality();
+    double[] values = new double[dim];
+    for(int d = 0; d < dim; d++) {
+      values[d] = normalize(d, featureVector.doubleValue(d));
+    }
+    return factory.newNumberVector(values);
+  }
+
+  @Override
+  public V restore(V featureVector) throws NonNumericFeaturesException {
+    final int dim = featureVector.getDimensionality();
+    if(dim != mean.length) {
+      throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
+    }
+    double[] values = new double[dim];
+    for(int d = 0; d < dim; d++) {
+      values[d] = restore(d, featureVector.doubleValue(d));
+    }
+    return factory.newNumberVector(values);
+  }
+
+  /**
+   * Normalize a single dimension.
+   *
+   * If only a single mean is stored, it is used for every dimension.
+   *
+   * @param d Dimension
+   * @param val Value
+   * @return Normalized value
+   */
+  private double normalize(int d, double val) {
+    return val / mean[(mean.length == 1) ? 0 : d];
+  }
+
+  /**
+   * Restore a single dimension.
+   *
+   * If only a single mean is stored, it is used for every dimension.
+   *
+   * @param d Dimension
+   * @param val Value
+   * @return Restored (original) value
+   */
+  private double restore(int d, double val) {
+    return val * mean[(mean.length == 1) ? 0 : d];
+  }
+
+  /**
+   * Rescale the coefficients of a linear equation system by the means.
+   *
+   * Every coefficient is divided by the mean of its column exactly once. The
+   * right hand side is passed on unchanged: this normalization is a pure
+   * scaling without offset, so no constant term arises. The coefficient
+   * arrays of the given system are modified in place.
+   *
+   * @param linearEquationSystem Equation system to transform
+   * @return Equation system built from the rescaled coefficients and the
+   *         original right hand side and permutations
+   */
+  @Override
+  public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
+    double[][] coeff = linearEquationSystem.getCoefficents();
+    double[] rhs = linearEquationSystem.getRHS();
+    int[] row = linearEquationSystem.getRowPermutations();
+    int[] col = linearEquationSystem.getColumnPermutations();
+
+    for(int r = 0; r < coeff.length; r++) {
+      final double[] line = coeff[row[r]];
+      for(int j = 0; j < line.length; j++) {
+        // NOTE(review): the mean is looked up by the permuted position j, as
+        // before; confirm whether col[j] should be used for permuted systems.
+        line[col[j]] /= mean[j];
+      }
+    }
+
+    return new LinearEquationSystem(coeff, rhs, row, col);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder();
+    result.append("normalization class: ").append(getClass().getName());
+    result.append('\n');
+    result.append("normalization means: ").append(FormatUtil.format(mean));
+
+    return result.toString();
+  }
+
+  @Override
+  protected Logging getLogger() {
+    return LOG;
+  }
+
+  @Override
+  protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+    return TypeUtil.NUMBER_VECTOR_FIELD;
+  }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java
index 47b6db5f..26a125ad 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseMinMaxNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,11 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -46,24 +49,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParamet
*
* @apiviz.uses NumberVector
*/
-// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseMinMaxNormalization"})
+public class AttributeWiseMinMaxNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(AttributeWiseMinMaxNormalization.class);
/**
- * Parameter for minimum.
- */
- public static final OptionID MINIMA_ID = new OptionID("normalize.min", "a comma separated concatenation of the minimum values in each dimension that are mapped to 0. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
-
- /**
- * Parameter for maximum.
- */
- public static final OptionID MAXIMA_ID = new OptionID("normalize.max", "a comma separated concatenation of the maximum values in each dimension that are mapped to 1. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
-
- /**
* Stores the maximum in each dimension.
*/
private double[] maxima = new double[0];
@@ -130,16 +123,14 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if(featureVector.getDimensionality() == maxima.length && featureVector.getDimensionality() == minima.length) {
- double[] values = new double[featureVector.getDimensionality()];
- for(int d = 0; d < featureVector.getDimensionality(); d++) {
- values[d] = (featureVector.doubleValue(d) * (factor(d)) + minima[d]);
- }
- return factory.newNumberVector(values);
- }
- else {
+ if(featureVector.getDimensionality() != maxima.length || featureVector.getDimensionality() != minima.length) {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + maxima.length);
}
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = featureVector.doubleValue(d) * factor(d) + minima[d];
+ }
+ return factory.newNumberVector(values);
}
/**
@@ -174,8 +165,7 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
}
}
- LinearEquationSystem lq = new LinearEquationSystem(coeff, rhs, row, col);
- return lq;
+ return new LinearEquationSystem(coeff, rhs, row, col);
}
@Override
@@ -190,13 +180,13 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
}
@Override
- protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ protected Logging getLogger() {
+ return LOG;
}
@Override
- protected Logging getLogger() {
- return LOG;
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
}
/**
@@ -206,7 +196,17 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for minimum.
+ */
+ public static final OptionID MINIMA_ID = new OptionID("normalize.min", "a comma separated concatenation of the minimum values in each dimension that are mapped to 0. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for maximum.
+ */
+ public static final OptionID MAXIMA_ID = new OptionID("normalize.max", "a comma separated concatenation of the maximum values in each dimension that are mapped to 1. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
+
/**
* Stores the maximum in each dimension.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java
index a24cae25..a7241441 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/AttributeWiseVarianceNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,9 +26,12 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -48,32 +51,22 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParamet
*
* @apiviz.uses NumberVector
*/
-// TODO: extract superclass AbstractAttributeWiseNormalization
-public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.AttributeWiseVarianceNormalization", "z" })
+public class AttributeWiseVarianceNormalization<V extends NumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(AttributeWiseVarianceNormalization.class);
/**
- * Parameter for means.
- */
- public static final OptionID MEAN_ID = new OptionID("normalize.mean", "a comma separated concatenation of the mean values in each dimension that are mapped to 0. If no value is specified, the mean value of the attribute range in this dimension will be taken.");
-
- /**
- * Parameter for stddevs.
- */
- public static final OptionID STDDEV_ID = new OptionID("normalize.stddev", "a comma separated concatenation of the standard deviations in each dimension that are scaled to 1. If no value is specified, the standard deviation of the attribute range in this dimension will be taken.");
-
- /**
* Stores the mean in each dimension.
*/
- private double[] mean = new double[0];
+ private double[] mean;
/**
* Stores the standard deviation in each dimension.
*/
- private double[] stddev = new double[0];
+ private double[] stddev;
/**
* Temporary storage used during initialization.
@@ -152,16 +145,14 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
@Override
public V restore(V featureVector) throws NonNumericFeaturesException {
- if(featureVector.getDimensionality() == mean.length) {
- double[] values = new double[featureVector.getDimensionality()];
- for(int d = 0; d < featureVector.getDimensionality(); d++) {
- values[d] = restore(d, featureVector.doubleValue(d));
- }
- return factory.newNumberVector(values);
- }
- else {
+ if(featureVector.getDimensionality() != mean.length) {
throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + mean.length);
}
+ double[] values = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < featureVector.getDimensionality(); d++) {
+ values[d] = restore(d, featureVector.doubleValue(d));
+ }
+ return factory.newNumberVector(values);
}
/**
@@ -172,12 +163,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
* @return Normalized value
*/
private double normalize(int d, double val) {
- if(mean.length == 1) {
- return (val - mean[0]) / stddev[0];
- }
- else {
- return (val - mean[d]) / stddev[d];
- }
+ d = (mean.length == 1) ? 0 : d;
+ return (val - mean[d]) / stddev[d];
}
/**
@@ -188,12 +175,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
* @return Normalized value
*/
private double restore(int d, double val) {
- if(mean.length == 1) {
- return (val * stddev[0]) + mean[0];
- }
- else {
- return (val * stddev[d]) + mean[d];
- }
+ d = (mean.length == 1) ? 0 : d;
+ return (val * stddev[d]) + mean[d];
}
@Override
@@ -214,13 +197,7 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
}
}
- LinearEquationSystem lq = new LinearEquationSystem(coeff, rhs, row, col);
- return lq;
- }
-
- @Override
- protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ return new LinearEquationSystem(coeff, rhs, row, col);
}
@Override
@@ -240,6 +217,11 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
return LOG;
}
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
+ }
+
/**
* Parameterization class.
*
@@ -247,7 +229,17 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for means.
+ */
+ public static final OptionID MEAN_ID = new OptionID("normalize.mean", "a comma separated concatenation of the mean values in each dimension that are mapped to 0. If no value is specified, the mean value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for stddevs.
+ */
+ public static final OptionID STDDEV_ID = new OptionID("normalize.stddev", "a comma separated concatenation of the standard deviations in each dimension that are scaled to 1. If no value is specified, the standard deviation of the attribute range in this dimension will be taken.");
+
/**
* Stores the mean in each dimension.
*/
@@ -261,22 +253,22 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- DoubleListParameter meanP = new DoubleListParameter(MEAN_ID, true);
- DoubleListParameter stddevP = new DoubleListParameter(STDDEV_ID, true);
- config.grab(meanP);
- config.grab(stddevP);
- // Note: grab first, then use isDefined, to ensure the stddev is grabbed.
- if(meanP.isDefined() && stddevP.isDefined()) {
+ DoubleListParameter meanP = new DoubleListParameter(MEAN_ID) //
+ .setOptional(true);
+ if(config.grab(meanP)) {
mean = ArrayLikeUtil.toPrimitiveDoubleArray(meanP.getValue());
+ }
+ DoubleListParameter stddevP = new DoubleListParameter(STDDEV_ID) //
+ .setOptional(true);
+ if(config.grab(stddevP)) {
stddev = ArrayLikeUtil.toPrimitiveDoubleArray(stddevP.getValue());
for(double d : stddev) {
- if(d == 0) {
+ if(d == 0.) {
config.reportError(new WrongParameterValueException("Standard deviations must not be 0."));
}
}
}
-
config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(meanP, stddevP));
config.checkConstraint(new EqualSizeGlobalConstraint(meanP, stddevP));
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java
index bb9c2aec..ca320ec6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/RankTieNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/IntegerRankTieNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
*/
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.IntegerVector;
@@ -34,21 +33,25 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
/**
* Normalize vectors according to their rank in the attributes.
*
- * Note: ranks are multiplied by 2, to be able to give ties an integer rank.
- * (e.g. first two records are tied at "1" then, followed by the next on "4")
+ * Note: <b>ranks are multiplied by 2</b>, to be able to give ties an integer
+ * rank. (e.g. when the first two records are tied, they both have rank "1"
+ * then, followed by the next on "4")
*
* @author Erich Schubert
*/
-public class RankTieNormalization implements ObjectFilter {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.RankTieNormalization", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.IntegerRankTieNormalization" })
+public class IntegerRankTieNormalization implements ObjectFilter {
/**
* Constructor.
*/
- public RankTieNormalization() {
+ public IntegerRankTieNormalization() {
super();
}
@@ -57,6 +60,12 @@ public class RankTieNormalization implements ObjectFilter {
final int len = objects.dataLength();
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+ int[] order = new int[len];
+ for(int i = 0; i < len; i++) {
+ order[i] = i;
+ }
+ Sorter comparator = new Sorter();
+
for(int r = 0; r < objects.metaLength(); r++) {
final SimpleTypeInformation<?> type = objects.meta(r);
final List<?> column = objects.getColumn(r);
@@ -65,7 +74,7 @@ public class RankTieNormalization implements ObjectFilter {
continue;
}
@SuppressWarnings("unchecked")
- final List<? extends NumberVector<?>> castColumn = (List<? extends NumberVector<?>>) column;
+ final List<? extends NumberVector> castColumn = (List<? extends NumberVector>) column;
// Get the replacement type information
final int dim = ((VectorFieldTypeInformation<?>) type).getDimensionality();
final VectorFieldTypeInformation<IntegerVector> outType = new VectorFieldTypeInformation<>(IntegerVector.STATIC, dim);
@@ -73,29 +82,21 @@ public class RankTieNormalization implements ObjectFilter {
// Output vectors
int[][] posvecs = new int[len][dim];
// Sort for each dimension
- // TODO: an int[] array would be enough, if we could use a comparator...
- DoubleIntPair[] sorter = new DoubleIntPair[len];
- for(int i = 0; i < sorter.length; i++) {
- sorter[i] = new DoubleIntPair(Double.NaN, -1);
- }
for(int d = 0; d < dim; d++) {
- // fill array
- for(int i = 0; i < sorter.length; i++) {
- sorter[i].first = castColumn.get(i).doubleValue(d);
- sorter[i].second = i;
- }
// Sort
- Arrays.sort(sorter);
+ comparator.setup(castColumn, d);
+ IntegerArrayQuickSort.sort(order, comparator);
// Transfer positions to output vectors
- for(int sta = 0; sta < sorter.length;) {
+ for(int sta = 0; sta < order.length;) {
+ double v = castColumn.get(order[sta]).doubleValue(d);
// Compute ties
int end = sta + 1;
- while(end < sorter.length && !(sorter[sta].first < sorter[end].first)) {
+ while(end < order.length && !(v < castColumn.get(order[end]).doubleValue(d))) {
end++;
}
final int pos = (sta + end - 1);
for(int i = sta; i < end; i++) {
- posvecs[sorter[i].second][d] = pos;
+ posvecs[order[i]][d] = pos;
}
sta = end;
}
@@ -110,4 +111,40 @@ public class RankTieNormalization implements ObjectFilter {
}
return bundle;
}
+
+ /**
+ * Comparator ordering row indexes by the value of one dimension of a column.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class Sorter implements IntegerComparator {
+ /**
+ * Column to use for sorting.
+ */
+ List<? extends NumberVector> col;
+
+ /**
+ * Dimension to use for sorting.
+ */
+ int dim;
+
+ /**
+ * Configure the sorting class; must be called before {@link #compare}.
+ *
+ * @param col Column to read
+ * @param dim Dimension to use.
+ */
+ public void setup(List<? extends NumberVector> col, int dim) {
+ this.col = col;
+ this.dim = dim;
+ }
+
+ @Override
+ public int compare(int x, int y) {
+ // Double.compare is a total order (NaN sorts last); chained < and == tests would report "greater" for NaN in both argument orders.
+ return Double.compare(col.get(x).doubleValue(dim), col.get(y).doubleValue(dim));
+ }
+ }
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java
index 21263890..99054f83 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/InverseDocumentFrequencyNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,11 +29,13 @@ import gnu.trove.map.hash.TIntDoubleHashMap;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractNormalization;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
- * Normalization for text frequency vectors, using the inverse document
- * frequency.
+ * Normalization for text frequency (TF) vectors, using the inverse document
+ * frequency (IDF). See also: TF-IDF for text analysis.
*
* @author Erich Schubert
*
@@ -41,7 +43,8 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
*
* @param <V> Vector type
*/
-public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<?>> extends AbstractNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.InverseDocumentFrequencyNormalization" })
+public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector> extends AbstractNormalization<V> {
/**
* Class logger.
*/
@@ -102,7 +105,7 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<
final int dim = featureVector.iterDim(it);
vals.put(dim, featureVector.iterDoubleValue(it) * idf.get(dim));
}
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
+ return ((SparseNumberVector.Factory<V>) factory).newNumberVector(vals, featureVector.getDimensionality());
}
@Override
@@ -112,12 +115,12 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector<
final int dim = featureVector.iterDim(it);
vals.put(dim, featureVector.iterDoubleValue(it) / idf.get(dim));
}
- return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality());
+ return ((SparseNumberVector.Factory<V>) factory).newNumberVector(vals, featureVector.getDimensionality());
}
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.SPARSE_VECTOR_FIELD;
+ return TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH;
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java
new file mode 100644
index 00000000..f1fac885
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/columnwise/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Normalizations operating on columns / variates; where each column is treated independently.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.columnwise; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java
new file mode 100644
index 00000000..b2da96a9
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/HellingerHistogramNormalization.java
@@ -0,0 +1,97 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Normalize histograms by scaling them to L1 norm 1, then taking the square
+ * root in each attribute.
+ *
+ * Using Euclidean distance (linear kernel) and this transformation is the same
+ * as using Hellinger distance:
+ * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.probabilistic.HellingerDistanceFunction}
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.HellingerHistogramNormalization" })
+public class HellingerHistogramNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Static instance.
+ */
+ public static final HellingerHistogramNormalization<NumberVector> STATIC = new HellingerHistogramNormalization<>();
+
+ /**
+ * Constructor.
+ */
+ public HellingerHistogramNormalization() {
+ super();
+ }
+
+ /**
+ * Replace each value by {@code sqrt(|x| / sum)}, with {@code sum} the L1 norm of the vector; if the L1 norm is not positive (all zero, or NaN), only the absolute values are returned.
+ */
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ double[] data = new double[featureVector.getDimensionality()];
+ double sum = 0.;
+ for(int d = 0; d < data.length; ++d) {
+ data[d] = Math.abs(featureVector.doubleValue(d)); // abs() also maps -0.0 to +0.0
+ sum += data[d];
+ }
+ // Normalize and sqrt (zero entries stay zero, as sqrt(0 / sum) == 0):
+ if(sum > 0.) {
+ for(int d = 0; d < data.length; ++d) {
+ data[d] = Math.sqrt(data[d] / sum);
+ }
+ }
+ return factory.newNumberVector(data);
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected HellingerHistogramNormalization<NumberVector> makeInstance() {
+ return STATIC;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java
new file mode 100644
index 00000000..05485909
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMeanVarianceNormalization.java
@@ -0,0 +1,159 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Normalize vectors such that they have zero mean and unit variance.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class InstanceMeanVarianceNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Multiplicity of the vector.
+ */
+ private int multiplicity;
+
+ /**
+ * Constructor.
+ */
+ public InstanceMeanVarianceNormalization() {
+ super();
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ double[] raw = featureVector.getColumnVector().getArrayRef();
+ if(raw.length == 0) {
+ return factory.newNumberVector(new double[] {});
+ }
+ if(raw.length == 1) {
+ // Constant, but preserve NaNs
+ return factory.newNumberVector(new double[] { raw[0] == raw[0] ? 0. : Double.NaN });
+ }
+ // Multivariate codepath:
+ if(multiplicity > 1) {
+ assert (raw.length % multiplicity == 0) : "Vector length is not divisible by multiplicity?";
+ return factory.newNumberVector(multivariateStandardization(raw));
+ }
+ return factory.newNumberVector(univariateStandardization(raw));
+ }
+
+ /**
+ * Standardize a vector in place: subtract the mean, then divide by the sample standard deviation (skipped when that is zero or NaN).
+ * @param raw Values to standardize (modified and returned)
+ * @return the array {@code raw}
+ */
+ protected double[] univariateStandardization(double[] raw) {
+ // Two pass normalization is numerically most stable, and Java should optimize this well enough.
+ // NOTE(review): NaN entries are skipped in the sums but still counted in the divisors below.
+ double sum = 0.;
+ for(int i = 0; i < raw.length; ++i) {
+ final double v = raw[i];
+ sum += (v == v) ? v : 0.; // NaN guard
+ }
+ final double mean = sum / raw.length;
+ double ssum = 0.;
+ for(int i = 0; i < raw.length; ++i) {
+ final double v = raw[i] - mean;
+ ssum += (v == v) ? v * v : 0.; // NaN guard
+ }
+ // Sample standard deviation: the division by (n - 1) belongs inside the square root.
+ final double std = Math.sqrt(ssum / (raw.length - 1));
+ if(std > 0.) {
+ for(int i = 0; i < raw.length; ++i) {
+ raw[i] = (raw[i] - mean) / std;
+ }
+ }
+ return raw;
+ }
+
+ /**
+ * Standardize each interleaved variate in place (variate j occupies the indices j, j + multiplicity, ...); variates with zero spread are only centered.
+ * @param raw Values to standardize (modified and returned)
+ * @return the array {@code raw}
+ */
+ protected double[] multivariateStandardization(double[] raw) {
+ final int len = raw.length / multiplicity;
+ if(len <= 1) {
+ return raw;
+ }
+ // Two pass normalization is numerically most stable, and Java should optimize this well enough.
+ // NOTE(review): NaN entries are skipped in the sums but still counted in the divisors below.
+ double[] mean = new double[multiplicity];
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ final double v = raw[i];
+ mean[j] += (v == v) ? v : 0.; // NaN guard
+ }
+ for(int j = 0; j < multiplicity; ++j) {
+ mean[j] /= len;
+ }
+ double[] std = new double[multiplicity];
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ final double v = raw[i] - mean[j];
+ std[j] += (v == v) ? v * v : 0.; // NaN guard
+ }
+ // Sample standard deviation: the division by (len - 1) belongs inside the square root.
+ for(int j = 0; j < multiplicity; ++j) {
+ std[j] = std[j] > 0. ? Math.sqrt(std[j] / (len - 1)) : 1;
+ }
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ raw[i] = (raw[i] - mean[j]) / std[j];
+ }
+ return raw;
+ }
+
+ @Override
+ protected void initializeOutputType(SimpleTypeInformation<V> type) {
+ super.initializeOutputType(type);
+ multiplicity = ((VectorTypeInformation<?>) type).getMultiplicity();
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ @Override
+ protected InstanceMeanVarianceNormalization<V> makeInstance() {
+ return new InstanceMeanVarianceNormalization<>();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java
new file mode 100644
index 00000000..9f8f7680
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/InstanceMinMaxNormalization.java
@@ -0,0 +1,177 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Normalize vectors such that the smallest attribute is min (default 0), the largest is max (default 1).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+public class InstanceMinMaxNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Minimum and maximum values.
+ */
+ private double min, max;
+
+ /**
+ * Multiplicity of the vector.
+ */
+ private int multiplicity;
+
+ /**
+ * Constructor.
+ *
+ * @param min Desired minimum value
+ * @param max Desired maximum value
+ */
+ public InstanceMinMaxNormalization(double min, double max) {
+ super();
+ this.min = min;
+ this.max = max;
+ }
+
+ /**
+ * Constructor, normalizing to {@code [0;1]}
+ */
+ public InstanceMinMaxNormalization() {
+ this(0., 1.);
+ }
+
+ /**
+ * Linearly rescale a vector so its smallest non-NaN value maps to {@code min} and its largest to {@code max}; with multiplicity above 1 each interleaved variate is rescaled separately. Constant or all-NaN data is left unchanged.
+ */
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ double[] raw = featureVector.getColumnVector().getArrayRef();
+ // Multivariate codepath:
+ if(multiplicity > 1) {
+ assert (raw.length % multiplicity == 0) : "Vector length is not divisible by multiplicity?";
+ double[] mi = new double[multiplicity], ma = new double[multiplicity];
+ for(int i = 0; i < multiplicity; i++) {
+ mi[i] = Double.POSITIVE_INFINITY;
+ ma[i] = Double.NEGATIVE_INFINITY;
+ }
+ for(int i = 0, j = 0; i < raw.length; ++i, j = ++j % multiplicity) {
+ final double v = raw[i];
+ if(v != v) { continue; } // NaN guard
+ mi[j] = (mi[j] < v) ? mi[j] : v;
+ ma[j] = (ma[j] > v) ? ma[j] : v;
+ }
+ for(int j = 0; j < multiplicity; j++) {
+ if(mi[j] < ma[j]) {
+ final double s = (max - min) / (ma[j] - mi[j]);
+ // Variate j occupies the indices j, j + multiplicity, ...; start at j so only that variate is rescaled.
+ for(int i = j; i < raw.length; i += multiplicity) {
+ raw[i] = (raw[i] - mi[j]) * s + min;
+ }
+ }
+ }
+ return factory.newNumberVector(raw);
+ }
+ // Default codepath
+ double mi = Double.POSITIVE_INFINITY, ma = Double.NEGATIVE_INFINITY;
+ for(int i = 0; i < raw.length; ++i) {
+ final double v = raw[i];
+ if(v != v) { continue; } // NaN guard
+ mi = (mi < v) ? mi : v;
+ ma = (ma > v) ? ma : v;
+ }
+ if(mi < ma) {
+ final double s = (max - min) / (ma - mi);
+ for(int i = 0; i < raw.length; ++i) {
+ raw[i] = (raw[i] - mi) * s + min;
+ }
+ }
+ return factory.newNumberVector(raw);
+ }
+
+ @Override
+ protected void initializeOutputType(SimpleTypeInformation<V> type) {
+ super.initializeOutputType(type);
+ multiplicity = ((VectorTypeInformation<?>) type).getMultiplicity();
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Option ID for minimum value.
+ */
+ public static final OptionID MIN_ID = new OptionID("normalization.min", "Minimum value to assign to objects.");
+
+ /**
+ * Option ID for maximum value.
+ */
+ public static final OptionID MAX_ID = new OptionID("normalization.max", "Maximum value to assign to objects.");
+
+ /**
+ * Minimum and maximum values.
+ */
+ private double min, max;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ DoubleParameter minP = new DoubleParameter(MIN_ID, 0.) //
+ .setOptional(true);
+ if(config.grab(minP)) {
+ min = minP.doubleValue();
+ }
+ DoubleParameter maxP = new DoubleParameter(MAX_ID, 1.) //
+ .setOptional(true);
+ if(config.grab(maxP)) {
+ max = maxP.doubleValue();
+ }
+ config.checkConstraint(new LessGlobalConstraint<>(minP, maxP));
+ }
+
+ @Override
+ protected InstanceMinMaxNormalization<V> makeInstance() {
+ return new InstanceMinMaxNormalization<>(min, max);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java
index a12dea3b..51b2a34b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/LengthNormalization.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/LengthNormalization.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,9 +26,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.DoubleNorm;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.Norm;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -42,42 +43,32 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> vector type
*/
-public class LengthNormalization<V extends NumberVector<?>> extends AbstractStreamNormalization<V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.LengthNormalization"})
+public class LengthNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
/**
* Norm to use.
*/
- DoubleNorm<? super V> norm;
+ Norm<? super V> norm;
/**
* Constructor.
*
* @param norm Norm to use
*/
- public LengthNormalization(DoubleNorm<? super V> norm) {
+ public LengthNormalization(Norm<? super V> norm) {
super();
this.norm = norm;
}
@Override
protected V filterSingleObject(V featureVector) {
- final double d = norm.doubleNorm(featureVector);
+ final double d = norm.norm(featureVector);
return factory.newNumberVector(featureVector.getColumnVector().timesEquals(1 / d).getArrayRef());
}
@Override
- public V restore(V featureVector) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public LinearEquationSystem transform(LinearEquationSystem linearEquationSystem) {
- // TODO.
- throw new UnsupportedOperationException();
- }
-
- @Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return TypeUtil.NUMBER_VECTOR_FIELD;
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
}
/**
@@ -87,7 +78,7 @@ public class LengthNormalization<V extends NumberVector<?>> extends AbstractStre
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Option ID for normalization norm.
*/
@@ -96,12 +87,12 @@ public class LengthNormalization<V extends NumberVector<?>> extends AbstractStre
/**
* Norm to use.
*/
- DoubleNorm<? super V> norm;
+ Norm<? super V> norm;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- ObjectParameter<DoubleNorm<? super V>> normP = new ObjectParameter<>(NORM_ID, DoubleNorm.class, EuclideanDistanceFunction.class);
+ ObjectParameter<Norm<? super V>> normP = new ObjectParameter<>(NORM_ID, Norm.class, EuclideanDistanceFunction.class);
if(config.grab(normP)) {
norm = normP.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java
new file mode 100644
index 00000000..8970e7ef
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/Log1PlusNormalization.java
@@ -0,0 +1,119 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.AbstractStreamNormalization;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+
+/**
+ * Normalize the data set by applying log(1+|x|*b)/log(b+1) to any value. If the
+ * input data was in [0;1], then the resulting values will be in the same range.
+ *
+ * By default b=1, and thus the transformation is log2(1+|x|).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> vector type
+ */
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.Log1PlusNormalization" })
+public class Log1PlusNormalization<V extends NumberVector> extends AbstractStreamNormalization<V> {
+ /**
+ * Static instance.
+ */
+ public static final Log1PlusNormalization<NumberVector> STATIC = new Log1PlusNormalization<>(1.);
+
+ /**
+ * Boosting factor, and scaling coefficient.
+ */
+ protected double boost, scale;
+
+ /**
+ * Constructor.
+ *
+ * @param boost Boosting parameter
+ */
+ public Log1PlusNormalization(double boost) {
+ super();
+ this.boost = boost;
+ this.scale = 1. / Math.log1p(boost);
+ }
+
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ double[] data = new double[featureVector.getDimensionality()];
+ for(int d = 0; d < data.length; ++d) {
+ data[d] = featureVector.doubleValue(d);
+ data[d] = Math.log1p((data[d] > 0 ? data[d] : -data[d]) * boost) * scale;
+ }
+ return factory.newNumberVector(data);
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Boosting factor parameter.
+ */
+ public static final OptionID BOOST_ID = new OptionID("log1pscale.boost", "Boosting factor. Larger values will yield a steeper curve.");
+
+ /**
+ * Boosting factor.
+ */
+ protected double boost;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ DoubleParameter boostP = new DoubleParameter(BOOST_ID, 1.) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(boostP)) {
+ boost = boostP.doubleValue();
+ }
+ }
+
+ @Override
+ protected Log1PlusNormalization<V> makeInstance() {
+ return new Log1PlusNormalization<>(boost);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java
new file mode 100644
index 00000000..9ac613c0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/instancewise/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Instancewise normalization, where each instance is normalized independently.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.normalization.instancewise; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
index 15d689d7..552d7003 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
index 87684499..249c3764 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/package-info.java
@@ -2,12 +2,13 @@
* <p>Data filtering, in particular for normalization and projection.</p>
*
* @apiviz.exclude de.lmu.ifi.dbs.elki.utilities.*
+ * @apiviz.exclude de.lmu.ifi.dbs.elki.datasource.filter\.(normalization|transform)\.*
*/
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ByLabelFilter.java
index 66707da6..8683ca8c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ByLabelFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,12 +23,15 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -36,12 +39,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
/**
- * A filter to sort the data set by some label.
+ * A filter to select objects of the data set by their label.
*
* @author Erich Schubert
*
* @apiviz.uses LabelList oneway - - «reads»
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.ByLabelFilter", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ByLabelFilter" })
public class ByLabelFilter extends AbstractStreamFilter {
/**
* Class logger
@@ -49,9 +53,9 @@ public class ByLabelFilter extends AbstractStreamFilter {
private static final Logging LOG = Logging.getLogger(ByLabelFilter.class);
/**
- * The filter pattern
+ * The filter pattern matcher
*/
- private final Pattern pattern;
+ private final Matcher matcher;
/**
* Inversion flag
@@ -71,7 +75,7 @@ public class ByLabelFilter extends AbstractStreamFilter {
*/
public ByLabelFilter(Pattern pattern, boolean inverted) {
super();
- this.pattern = pattern;
+ this.matcher = pattern.matcher("");
this.inverted = inverted;
}
@@ -91,7 +95,7 @@ public class ByLabelFilter extends AbstractStreamFilter {
Event ev = source.nextEvent();
switch(ev){
case END_OF_STREAM:
- if (lblcol < 0) {
+ if(lblcol < 0) {
LOG.warning("By label filter was used, but never saw a label relation!");
}
return Event.END_OF_STREAM;
@@ -114,7 +118,8 @@ public class ByLabelFilter extends AbstractStreamFilter {
boolean good = false;
final LabelList ll = (LabelList) l;
for(int i = 0; i < ll.size(); i++) {
- if(pattern.matcher(ll.get(i)).matches()) {
+ matcher.reset(ll.get(i));
+ if(matcher.matches()) {
good = true;
break;
}
@@ -124,7 +129,8 @@ public class ByLabelFilter extends AbstractStreamFilter {
}
}
else {
- if(!pattern.matcher(l.toString()).matches()) {
+ matcher.reset(l.toString());
+ if(!matcher.matches()) {
continue;
}
}
@@ -190,7 +196,7 @@ public class ByLabelFilter extends AbstractStreamFilter {
}
@Override
- protected Object makeInstance() {
+ protected ByLabelFilter makeInstance() {
return new ByLabelFilter(pattern, inverted);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/RandomSamplingStreamFilter.java
index a7e44d4d..3e1a3d89 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/RandomSamplingStreamFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,9 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
import java.util.Random;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
@@ -39,6 +41,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.RandomSamplingStreamFilter", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.RandomSamplingStreamFilter" })
public class RandomSamplingStreamFilter extends AbstractStreamFilter {
/**
* Probability
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ShuffleObjectsFilter.java
index 8afa8290..3fb77ce4 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/ShuffleObjectsFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,8 +28,10 @@ import java.util.List;
import java.util.Random;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -40,6 +42,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.ShuffleObjectsFilter" })
public class ShuffleObjectsFilter implements ObjectFilter {
/**
* Class logger
@@ -73,18 +76,18 @@ public class ShuffleObjectsFilter implements ObjectFilter {
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Shuffling the data set");
}
final Random random = rnd.getSingleThreadedRandom();
final int size = objects.dataLength();
final int[] offsets = new int[size];
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
offsets[i] = i;
}
// Randomize the offset array
- for (int i = size; i > 1; i--) {
+ for(int i = size; i > 1; i--) {
final int j = random.nextInt(i);
// Swap the elements at positions j and i - 1:
final int temp = offsets[j];
@@ -93,11 +96,11 @@ public class ShuffleObjectsFilter implements ObjectFilter {
}
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
// Reorder column accordingly
List<?> in = objects.getColumn(j);
List<Object> data = new ArrayList<>(size);
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
data.add(in.get(offsets[i]));
}
bundle.appendColumn(objects.meta(j), data);
@@ -119,13 +122,13 @@ public class ShuffleObjectsFilter implements ObjectFilter {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
@Override
- protected Object makeInstance() {
+ protected ShuffleObjectsFilter makeInstance() {
return new ShuffleObjectsFilter(rnd);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/SortByLabelFilter.java
index d35d9cde..a6cef5fd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SortByLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/SortByLabelFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.selection;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,7 +28,9 @@ import java.util.List;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
@@ -39,6 +41,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
*
* @apiviz.uses de.lmu.ifi.dbs.elki.data.LabelList oneway - - «reads»
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.SortByLabelFilter", "de.lmu.ifi.dbs.elki.datasource.filter.normalization.SortByLabelFilter" })
public class SortByLabelFilter implements ObjectFilter {
/**
* Class logger
@@ -54,22 +57,22 @@ public class SortByLabelFilter implements ObjectFilter {
@Override
public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Shuffling the data set");
}
// Prepare a reposition array for cheap resorting
final int size = objects.dataLength();
final int[] offsets = new int[size];
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
offsets[i] = i;
}
// Sort by labels - identify a label column
final int lblcol;
{
int lblc = -1;
- for (int i = 0; i < objects.metaLength(); i++) {
- if (TypeUtil.GUESSED_LABEL.isAssignableFromType(objects.meta(i))) {
+ for(int i = 0; i < objects.metaLength(); i++) {
+ if(TypeUtil.GUESSED_LABEL.isAssignableFromType(objects.meta(i))) {
lblc = i;
break;
}
@@ -86,11 +89,11 @@ public class SortByLabelFilter implements ObjectFilter {
});
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
- for (int j = 0; j < objects.metaLength(); j++) {
+ for(int j = 0; j < objects.metaLength(); j++) {
// Reorder column accordingly
List<?> in = objects.getColumn(j);
List<Object> data = new ArrayList<>(size);
- for (int i = 0; i < size; i++) {
+ for(int i = 0; i < size; i++) {
data.add(in.get(offsets[i]));
}
bundle.appendColumn(objects.meta(j), data);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java
new file mode 100644
index 00000000..7ec0a3a3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/selection/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Filters for selecting and sorting data to process.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.selection; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java
index 462db9eb..8c1ef6cb 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,13 +33,12 @@ import java.util.Map;
import de.lmu.ifi.dbs.elki.data.ClassLabel;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.NumberVector.Factory;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.filter.ClassLabelFilter;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.typeconversions.ClassLabelFilter;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
@@ -60,7 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*
* @param <V> Vector type
*/
-public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberVector<?>> implements ObjectFilter {
+public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberVector> implements ObjectFilter {
/**
* The dimensionality to which the data should be reduced.
*/
@@ -114,7 +113,7 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV
List<V> vectorcolumn = (List<V>) column;
final VectorFieldTypeInformation<?> vtype = (VectorFieldTypeInformation<?>) type;
@SuppressWarnings("unchecked")
- NumberVector.Factory<V, ?> factory = (NumberVector.Factory<V, ?>) vtype.getFactory();
+ NumberVector.Factory<V> factory = (NumberVector.Factory<V> ) vtype.getFactory();
int dim = vtype.getDimensionality();
if(tdim > dim) {
@@ -155,7 +154,7 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV
* @param factory Vector factory
* @return output type restriction
*/
- protected SimpleTypeInformation<?> convertedType(SimpleTypeInformation<?> in, Factory<V, ?> factory) {
+ protected SimpleTypeInformation<?> convertedType(SimpleTypeInformation<?> in, NumberVector.Factory<V> factory) {
return new VectorFieldTypeInformation<>(factory, tdim);
}
@@ -206,7 +205,7 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV
*
* @param <V> Vector type
*/
- public abstract static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public abstract static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* The number of dimensions to keep.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java
index d646b489..32024581 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ClassicMultidimensionalScalingTransform.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -30,8 +30,9 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
@@ -54,6 +55,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.composedOf SingularValueDecomposition
+ *
* @param <O> Data type
*/
@Alias({ "mds" })
@@ -66,7 +69,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
/**
* Distance function to use.
*/
- PrimitiveDoubleDistanceFunction<? super O> dist = null;
+ PrimitiveDistanceFunction<? super O> dist = null;
/**
* Target dimensionality
@@ -79,7 +82,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
* @param tdim Target dimensionality.
* @param dist Distance function to use.
*/
- public ClassicMultidimensionalScalingTransform(int tdim, PrimitiveDoubleDistanceFunction<? super O> dist) {
+ public ClassicMultidimensionalScalingTransform(int tdim, PrimitiveDistanceFunction<? super O> dist) {
super();
this.tdim = tdim;
this.dist = dist;
@@ -105,14 +108,14 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
// Get the replacement type information
@SuppressWarnings("unchecked")
final List<O> castColumn = (List<O>) column;
- NumberVector.Factory<? extends NumberVector<?>, ?> factory = null;
+ NumberVector.Factory<? extends NumberVector> factory = null;
{
if (type instanceof VectorFieldTypeInformation) {
final VectorFieldTypeInformation<?> ctype = (VectorFieldTypeInformation<?>) type;
// Note two-step cast, to make stricter compilers happy.
@SuppressWarnings("unchecked")
- final VectorFieldTypeInformation<? extends NumberVector<?>> vtype = (VectorFieldTypeInformation<? extends NumberVector<?>>) ctype;
- factory = (NumberVector.Factory<? extends NumberVector<?>, ?>) vtype.getFactory();
+ final VectorFieldTypeInformation<? extends NumberVector> vtype = (VectorFieldTypeInformation<? extends NumberVector>) ctype;
+ factory = FilterUtil.guessFactory(vtype);
} else {
factory = DoubleVector.FACTORY;
}
@@ -128,16 +131,12 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
final O ox = castColumn.get(x);
for (int y = x + 1; y < size; y++) {
final O oy = castColumn.get(y);
- double distance = Math.abs(dist.doubleDistance(ox, oy));
+ double distance = Math.abs(dist.distance(ox, oy));
imat[x][y] = distance;
- if (dprog != null) {
- dprog.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(dprog);
}
}
- if (dprog != null) {
- dprog.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(dprog);
}
// Adjust distance matrix:
if (dist instanceof SquaredEuclideanDistanceFunction) {
@@ -230,7 +229,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
*
* @apiviz.exclude
*/
- public static class Parameterizer<O extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<O extends NumberVector> extends AbstractParameterizer {
/**
* Desired dimensionality.
*/
@@ -249,7 +248,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
/**
* Distance function to use.
*/
- PrimitiveDoubleDistanceFunction<? super O> dist = null;
+ PrimitiveDistanceFunction<? super O> dist = null;
@Override
protected void makeOptions(Parameterization config) {
@@ -260,7 +259,7 @@ public class ClassicMultidimensionalScalingTransform<O> implements ObjectFilter
tdim = dimP.intValue();
}
- ObjectParameter<PrimitiveDoubleDistanceFunction<? super O>> distP = new ObjectParameter<>(DISTANCE_ID, PrimitiveDoubleDistanceFunction.class, SquaredEuclideanDistanceFunction.class);
+ ObjectParameter<PrimitiveDistanceFunction<? super O>> distP = new ObjectParameter<>(DISTANCE_ID, PrimitiveDistanceFunction.class, SquaredEuclideanDistanceFunction.class);
if (config.grab(distP)) {
dist = distP.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java
index 3b4193ad..c6bd02a9 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/GlobalPrincipalComponentAnalysisTransform.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -55,10 +55,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.composedOf PCARunner
+ * @apiviz.composedOf CovarianceMatrix
+ * @apiviz.composedOf EigenPairFilter
+ *
* @param <O> Vector type
*/
@Alias({ "whiten", "whitening", "pca" })
-public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector<?>> extends AbstractVectorConversionFilter<O, O> {
+public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector> extends AbstractVectorConversionFilter<O, O> {
/**
* Class logger.
*/
@@ -122,7 +126,7 @@ public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector<?>
@Override
protected void prepareComplete() {
mean = covmat.getMeanVector().getArrayRef();
- PCAResult pcares = (new PCARunner<O>(null)).processCovarMatrix(covmat.destroyToSampleMatrix());
+ PCAResult pcares = (new PCARunner(null)).processCovarMatrix(covmat.destroyToSampleMatrix());
SortedEigenPairs eps = pcares.getEigenPairs();
covmat = null;
@@ -190,7 +194,7 @@ public class GlobalPrincipalComponentAnalysisTransform<O extends NumberVector<?>
*
* @apiviz.exclude
*/
- public static class Parameterizer<O extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<O extends NumberVector> extends AbstractParameterizer {
/**
* To specify the eigenvectors to keep.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/HistogramJitterFilter.java
index 453d294e..8c34ce37 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/HistogramJitterFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.transform;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,8 +26,9 @@ import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.ExponentialDistribution;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -55,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* @param <V> Vector type
*/
@Description("Add uniform Jitter to a dataset, while preserving the total vector sum.")
-public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<V, V> {
+public class HistogramJitterFilter<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> {
/**
* Jitter amount.
*/
@@ -145,8 +146,8 @@ public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVe
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- DoubleParameter jitterP = new DoubleParameter(JITTER_ID);
- jitterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ DoubleParameter jitterP = new DoubleParameter(JITTER_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
if(config.grab(jitterP)) {
jitter = jitterP.getValue().doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java
index 998c8931..9cb0b492 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LatLngToECEFFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,8 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.math.geodesy.EarthModel;
import de.lmu.ifi.dbs.elki.math.geodesy.SphericalVincentyEarthModel;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -39,13 +41,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.uses NumberVector
+ * @apiviz.composedOf EarthModel
+ *
* @param <V> Vector type.
*/
-public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStreamConversionFilter<V, V> {
+public class LatLngToECEFFilter<V extends NumberVector> extends AbstractStreamConversionFilter<V, V> {
/**
* Vector factory to use.
*/
- private NumberVector.Factory<V, ?> factory;
+ private NumberVector.Factory<V> factory;
/**
* Earth model to use.
@@ -69,14 +74,13 @@ public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return new VectorFieldTypeInformation<>(NumberVector.class, 2, 2);
+ return TypeUtil.NUMBER_VECTOR_FIELD_2D;
}
@Override
protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
- VectorFieldTypeInformation<V> vin = (VectorFieldTypeInformation<V>) in;
- factory = (NumberVector.Factory<V, ?>) vin.getFactory();
- return new VectorFieldTypeInformation<>(vin.getFactory(), 3, 3, in.getSerializer());
+ factory = FilterUtil.guessFactory(in);
+ return new VectorFieldTypeInformation<>(factory, 3, 3, in.getSerializer());
}
/**
@@ -88,7 +92,7 @@ public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
*
* @param <V> Vector type
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Earth model to use.
*/
@@ -98,7 +102,7 @@ public class LatLngToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<EarthModel> modelP = new ObjectParameter<>(EarthModel.MODEL_ID, EarthModel.class, SphericalVincentyEarthModel.class);
- if (config.grab(modelP)) {
+ if(config.grab(modelP)) {
model = modelP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java
index 76546d5c..537bfb20 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LinearDiscriminantAnalysisFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -59,7 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
*/
@Alias("lda")
@Reference(authors = "R. A. Fisher", title = "The use of multiple measurements in taxonomic problems", booktitle = "Annals of eugenics 7.2 (1936)", url = "http://dx.doi.org/10.1111/j.1469-1809.1936.tb02137.x")
-public class LinearDiscriminantAnalysisFilter<V extends NumberVector<?>> extends AbstractSupervisedProjectionVectorFilter<V> {
+public class LinearDiscriminantAnalysisFilter<V extends NumberVector> extends AbstractSupervisedProjectionVectorFilter<V> {
/**
* Class logger.
*/
@@ -156,7 +156,7 @@ public class LinearDiscriminantAnalysisFilter<V extends NumberVector<?>> extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractSupervisedProjectionVectorFilter.Parameterizer<V> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractSupervisedProjectionVectorFilter.Parameterizer<V> {
@Override
protected LinearDiscriminantAnalysisFilter<V> makeInstance() {
return new LinearDiscriminantAnalysisFilter<>(tdim);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java
index ea0d4ef2..d5fba25d 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/LngLatToECEFFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,8 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.math.geodesy.EarthModel;
import de.lmu.ifi.dbs.elki.math.geodesy.SphericalVincentyEarthModel;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -39,13 +41,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.uses NumberVector
+ * @apiviz.composedOf EarthModel
+ *
* @param <V> Vector type.
*/
-public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStreamConversionFilter<V, V> {
+public class LngLatToECEFFilter<V extends NumberVector> extends AbstractStreamConversionFilter<V, V> {
/**
* Vector factory to use.
*/
- private NumberVector.Factory<V, ?> factory;
+ private NumberVector.Factory<V> factory;
/**
* Earth model to use.
@@ -69,14 +74,13 @@ public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
@Override
protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
- return new VectorFieldTypeInformation<>(NumberVector.class, 2, 2);
+ return TypeUtil.NUMBER_VECTOR_FIELD_2D;
}
@Override
protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
- VectorFieldTypeInformation<V> vin = (VectorFieldTypeInformation<V>) in;
- factory = (NumberVector.Factory<V, ?>) vin.getFactory();
- return new VectorFieldTypeInformation<>(vin.getFactory(), 3, 3, in.getSerializer());
+ factory = FilterUtil.guessFactory(in);
+ return new VectorFieldTypeInformation<>(factory, 3, 3, in.getSerializer());
}
/**
@@ -88,7 +92,7 @@ public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
*
* @param <V> Vector type
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Earth model to use.
*/
@@ -98,7 +102,7 @@ public class LngLatToECEFFilter<V extends NumberVector<?>> extends AbstractStrea
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<EarthModel> modelP = new ObjectParameter<>(EarthModel.MODEL_ID, EarthModel.class, SphericalVincentyEarthModel.class);
- if (config.grab(modelP)) {
+ if(config.grab(modelP)) {
model = modelP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java
index e6d0d15d..115d77dd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -51,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
*
* @param <V> Vector type
*/
-public class NumberVectorFeatureSelectionFilter<V extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<V, V> {
+public class NumberVectorFeatureSelectionFilter<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> {
/**
* Keeps the selection of the subspace to project onto.
*/
@@ -99,10 +99,8 @@ public class NumberVectorFeatureSelectionFilter<V extends NumberVector<?>> exten
}
/**
- * <p>
* Provides a BitSet with the bits set to true corresponding to the selected
* attributes in {@link Parameterizer#SELECTED_ATTRIBUTES_ID}.
- * </p>
*
* The index in the BitSet is shifted to the left by one, i.e., index 0 in the
* BitSet relates to the first attribute.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java
index 4086270c..dfca33ec 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -32,7 +32,7 @@ import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorStreamConversionFilter;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
import de.lmu.ifi.dbs.elki.utilities.Util;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -51,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
*
* @param <V> vector type
*/
-public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector<?>> extends AbstractVectorStreamConversionFilter<V, V> {
+public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector> extends AbstractVectorStreamConversionFilter<V, V> {
/**
* The selected attributes.
*/
@@ -155,10 +155,10 @@ public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector<?>>
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter kP = new IntParameter(NUMBER_SELECTED_ATTRIBUTES_ID, 1);
- kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ IntParameter kP = new IntParameter(NUMBER_SELECTED_ATTRIBUTES_ID, 1) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
- k = kP.getValue().intValue();
+ k = kP.intValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
if(config.grab(rndP)) {
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java
new file mode 100644
index 00000000..4e5fe9b3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/PerturbationFilter.java
@@ -0,0 +1,436 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.transform;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorConversionFilter;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.MeanVarianceMinMax;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeSetGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.EqualSizeGlobalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
+
+/**
+ * A filter to perturb the values by adding micro-noise.
+ *
+ * The added noise is generated, attribute-wise, by a Gaussian with mean=0 and a
+ * specified standard deviation or by a uniform distribution with a specified
+ * range. The standard deviation or the range can be scaled, attribute-wise, to
+ * a given percentage of the original standard deviation in the data
+ * distribution (assuming a Gaussian distribution there), or to a percentage of
+ * the extension in each attribute ({@code maximumValue - minimumValue}).
+ *
+ * This filter has a potentially wide use but has been implemented for the following publication:
+ *
+ * Reference:
+ * <p>
+ * A. Zimek, R. J. G. B. Campello, J. Sander:<br />
+ * Data Perturbation for Outlier Detection Ensembles.<br />
+ * In: Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014.
+ * </p>
+ *
+ * @author Arthur Zimek
+ */
+@Title("Data Perturbation for Outlier Detection Ensembles")
+@Description("A filter to perturb a datasset on read by an additive noise component, implemented for use in an outlier ensemble (this reference).")
+@Reference(authors = "A. Zimek, R. J. G. B. Campello, J. Sander",//
+title = "Data Perturbation for Outlier Detection Ensembles", //
+booktitle = "Proc. 26th International Conference on Scientific and Statistical Database Management (SSDBM), Aalborg, Denmark, 2014", //
+url = "http://dx.doi.org/10.1145/2618243.2618257")
+public class PerturbationFilter<V extends NumberVector> extends AbstractVectorConversionFilter<V, V> {
+ /**
+ * Class logger
+ */
+ private static final Logging LOG = Logging.getLogger(PerturbationFilter.class);
+
+ /**
+ * Scaling reference options.
+ *
+ * The chosen constant decides how the per-attribute noise scale is derived:
+ * <ul>
+ * <li>{@code UNITCUBE}: the percentage itself is used as the scale in every
+ * dimension (set up lazily on the first filtered vector).</li>
+ * <li>{@code STDDEV}: the sample standard deviation of each attribute,
+ * multiplied by the percentage.</li>
+ * <li>{@code MINMAX}: the extension (maximum - minimum) of each attribute,
+ * multiplied by the percentage; preset minima/maxima are used when given,
+ * otherwise the values observed in the data.</li>
+ * </ul>
+ * For {@code STDDEV} and {@code MINMAX}, a scale of zero or NaN is replaced by
+ * the plain percentage.
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.exclude
+ */
+ public static enum ScalingReference {
+ UNITCUBE, STDDEV, MINMAX
+ }
+
+ /**
+ * Nature of the noise distribution.
+ *
+ * <ul>
+ * <li>{@code GAUSSIAN}: the added offset is {@link Random#nextGaussian()}
+ * multiplied by the per-attribute scale.</li>
+ * <li>{@code UNIFORM}: the added offset is {@link Random#nextDouble()} (a
+ * value in [0, 1)) multiplied by the per-attribute scale.</li>
+ * </ul>
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.exclude
+ */
+ public static enum NoiseDistribution {
+ GAUSSIAN, UNIFORM
+ }
+
+ /**
+ * Which reference to use for scaling the noise.
+ */
+ private ScalingReference scalingreference;
+
+ /**
+ * Nature of the noise distribution.
+ */
+ private NoiseDistribution noisedistribution;
+
+ /**
+ * Random object to generate the attribute-wise seeds for the noise.
+ */
+ private final Random RANDOM;
+
+ /**
+ * Percentage of the variance of the random noise generation, given the
+ * variance of the corresponding attribute in the data.
+ */
+ private double percentage;
+
+ /**
+ * Temporary storage used during initialization.
+ */
+ private MeanVarianceMinMax[] mvs = null;
+
+ /**
+ * Stores the scaling reference in each dimension.
+ */
+ private double[] scalingreferencevalues = new double[0];
+
+ /**
+ * The random objects to generate noise distributions independently for each
+ * attribute.
+ */
+ private Random[] randomPerAttribute = null;
+
+ /**
+ * Stores the maximum in each dimension.
+ */
+ private double[] maxima;
+
+ /**
+ * Stores the minimum in each dimension.
+ */
+ private double[] minima;
+
+ /**
+ * Stores the dimensionality from the preprocessing.
+ */
+ private int dimensionality = 0;
+
+ /**
+ * Constructor.
+ *
+ * @param seed Seed value, may be {@code null} for a random seed.
+ * @param percentage Relative amount of jitter to add
+ * @param scalingreference Scaling reference
+ * @param minima Preset minimum values. May be {@code null}, which is treated
+ *        like an empty array (no preset).
+ * @param maxima Preset maximum values. May be {@code null}, which is treated
+ *        like an empty array (no preset).
+ * @param noisedistribution Nature of the noise distribution.
+ */
+ public PerturbationFilter(Long seed, double percentage, ScalingReference scalingreference, double[] minima, double[] maxima, NoiseDistribution noisedistribution) {
+ super();
+ this.percentage = percentage;
+ this.scalingreference = scalingreference;
+ // The preparation methods test minima.length / maxima.length, so a null
+ // preset must be normalized here to avoid a NullPointerException later.
+ this.minima = (minima != null) ? minima : new double[0];
+ this.maxima = (maxima != null) ? maxima : new double[0];
+ this.noisedistribution = noisedistribution;
+ this.RANDOM = (seed == null) ? new Random() : new Random(seed);
+ }
+
+ /**
+ * Decide how the per-attribute noise scale is obtained.
+ *
+ * With preset minima and maxima the scale is computed immediately; otherwise
+ * the result tells whether a scaling reference still has to be computed.
+ *
+ * @param in Input type information (not inspected here)
+ * @return {@code false} for unit cube scaling and for preset minima/maxima,
+ *         otherwise {@code true} exactly if no scaling reference values
+ *         exist yet
+ */
+ @Override
+ protected boolean prepareStart(SimpleTypeInformation<V> in) {
+ // Unit cube scaling is initialized lazily in filterSingleObject.
+ if(scalingreference == ScalingReference.UNITCUBE) {
+ return false;
+ }
+ final boolean presetRange = scalingreference == ScalingReference.MINMAX && minima.length != 0 && maxima.length != 0;
+ if(!presetRange) {
+ return (scalingreferencevalues.length == 0);
+ }
+ // Preset minima/maxima: derive the scale of each attribute right away.
+ dimensionality = minima.length;
+ scalingreferencevalues = new double[dimensionality];
+ randomPerAttribute = new Random[dimensionality];
+ for(int dim = 0; dim < dimensionality; dim++) {
+ final double scale = (maxima[dim] - minima[dim]) * percentage;
+ // Degenerate (zero or NaN) ranges fall back to the plain percentage.
+ scalingreferencevalues[dim] = (scale == 0 || Double.isNaN(scale)) ? percentage : scale;
+ randomPerAttribute[dim] = new Random(RANDOM.nextLong());
+ }
+ return false;
+ }
+
+ /**
+ * Feed one vector into the per-attribute statistics.
+ *
+ * @param featureVector Vector whose attribute values are recorded
+ */
+ @Override
+ protected void prepareProcessInstance(V featureVector) {
+ final int dim = featureVector.getDimensionality();
+ // The dimensionality is only known once the first vector arrives, so the
+ // statistics array is allocated lazily.
+ if(mvs == null) {
+ dimensionality = dim;
+ mvs = MeanVarianceMinMax.newArray(dimensionality);
+ }
+ for(int i = 0; i < dim; i++) {
+ mvs[i].put(featureVector.doubleValue(i));
+ }
+ }
+
+ /**
+ * Turn the collected statistics into per-attribute noise scales and create
+ * one random generator per attribute.
+ *
+ * The scale is the sample standard deviation (STDDEV) or the observed
+ * extension (MINMAX without preset minima/maxima), multiplied by the
+ * percentage. The statistics are released afterwards.
+ */
+ @Override
+ protected void prepareComplete() {
+ final StringBuilder msg = LOG.isDebuggingFine() ? new StringBuilder() : null;
+ scalingreferencevalues = new double[dimensionality];
+ randomPerAttribute = new Random[dimensionality];
+ final boolean byStddev = (scalingreference == ScalingReference.STDDEV);
+ // The observed extension is only used when no minima/maxima were preset.
+ final boolean byExtent = !byStddev && scalingreference == ScalingReference.MINMAX && minima.length == 0 && maxima.length == 0;
+ if(byStddev || byExtent) {
+ if(msg != null) {
+ msg.append(byStddev ? "Standard deviation per attribute: " : "extension per attribute: ");
+ }
+ for(int dim = 0; dim < dimensionality; dim++) {
+ final double reference = byStddev ? mvs[dim].getSampleStddev() : (mvs[dim].getMax() - mvs[dim].getMin());
+ final double scale = reference * percentage;
+ // Degenerate (zero or NaN) references fall back to the plain percentage.
+ scalingreferencevalues[dim] = (scale == 0 || Double.isNaN(scale)) ? percentage : scale;
+ randomPerAttribute[dim] = new Random(RANDOM.nextLong());
+ if(msg != null) {
+ msg.append(" ").append(dim).append(": ").append(scalingreferencevalues[dim] / percentage);
+ }
+ }
+ }
+ mvs = null;
+ if(msg != null) {
+ LOG.debugFine(msg.toString());
+ }
+ }
+
+ /**
+ * Input type restriction of this filter.
+ *
+ * @return {@link TypeUtil#NUMBER_VECTOR_FIELD}
+ */
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.NUMBER_VECTOR_FIELD;
+ }
+
+ /**
+ * Add noise to every attribute of a vector.
+ *
+ * @param featureVector Vector to perturb
+ * @return New vector built by the factory from the perturbed values
+ * @throws IllegalArgumentException if the vector dimensionality differs from
+ *         the number of scaling reference values
+ */
+ @Override
+ protected V filterSingleObject(V featureVector) {
+ // Unit cube scaling has no preparation pass: initialize on the first vector.
+ if(scalingreference == ScalingReference.UNITCUBE && dimensionality == 0) {
+ dimensionality = featureVector.getDimensionality();
+ scalingreferencevalues = new double[dimensionality];
+ randomPerAttribute = new Random[dimensionality];
+ for(int i = 0; i < dimensionality; i++) {
+ scalingreferencevalues[i] = percentage;
+ randomPerAttribute[i] = new Random(RANDOM.nextLong());
+ }
+ }
+ final int dim = featureVector.getDimensionality();
+ if(scalingreferencevalues.length != dim) {
+ throw new IllegalArgumentException("FeatureVectors and given Minima/Maxima differ in length.");
+ }
+ final double[] perturbed = new double[dim];
+ for(int i = 0; i < dim; i++) {
+ if(this.noisedistribution.equals(NoiseDistribution.GAUSSIAN)) {
+ final double original = featureVector.doubleValue(i);
+ perturbed[i] = original + randomPerAttribute[i].nextGaussian() * scalingreferencevalues[i];
+ }
+ else if(this.noisedistribution.equals(NoiseDistribution.UNIFORM)) {
+ final double original = featureVector.doubleValue(i);
+ perturbed[i] = original + randomPerAttribute[i].nextDouble() * scalingreferencevalues[i];
+ }
+ }
+ return factory.newNumberVector(perturbed);
+ }
+
+ /**
+ * Initialize the output type from the input type and pass the input type
+ * through unchanged.
+ *
+ * @param in Input type information
+ * @return the same {@code in} object
+ */
+ @Override
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
+ initializeOutputType(in);
+ return in;
+ }
+
+ /**
+ * Get the class logger.
+ *
+ * @return the static logger {@code LOG} of this class
+ */
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter for minimum.
+ */
+ public static final OptionID MINIMA_ID = new OptionID("perturbationfilter.min", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the minimum values in each dimension assumed as a reference. If no value is specified, the minimum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Parameter for maximum.
+ */
+ public static final OptionID MAXIMA_ID = new OptionID("perturbationfilter.max", "Only used, if " + ScalingReference.MINMAX + " is set as scaling reference: a comma separated concatenation of the maximum values in each dimension assumed as a reference. If no value is specified, the maximum value of the attribute range in this dimension will be taken.");
+
+ /**
+ * Stores the maximum in each dimension.
+ */
+ private double[] maxima = new double[0];
+
+ /**
+ * Stores the minimum in each dimension.
+ */
+ private double[] minima = new double[0];
+
+ /**
+ * Optional parameter to specify a seed for random Gaussian noise
+ * generation. If unused, system time is used as seed.
+ * <p>
+ * Key: {@code -perturbationfilter.seed}
+ * </p>
+ */
+ public static final OptionID SEED_ID = new OptionID("perturbationfilter.seed", "Seed for random noise generation.");
+
+ /**
+ * Seed for the random noise generation. If {@code null}, a random seed is
+ * used.
+ */
+ protected Long seed = null;
+
+ /**
+ * Optional parameter to specify a percentage of the standard deviation of
+ * the random Gaussian noise generation, given the standard deviation of the
+ * corresponding attribute in the original data distribution (assuming a
+ * Gaussian there).
+ *
+ * <p>
+ * Key: {@code -perturbationfilter.percentage}
+ * </p>
+ * <p>
+ * Default: <code>0.01</code>
+ * </p>
+ * <p>
+ * Constraint: 0 &lt; percentage &le; 1
+ * </p>
+ */
+ public static final OptionID PERCENTAGE_ID = new OptionID("perturbationfilter.percentage", "Percentage of the standard deviation of the random Gaussian noise generation per attribute, given the standard deviation of the corresponding attribute in the original data distribution (assuming a Gaussian distribution there).");
+
+ /**
+ * Parameter for selecting scaling reference.
+ * <p>
+ * Key: {@code -perturbationfilter.scalingreference}
+ * </p>
+ * <p>
+ * Default: <code>ScalingReference.UNITCUBE</code>
+ * </p>
+ */
+ public static final OptionID SCALINGREFERENCE_ID = new OptionID("perturbationfilter.scalingreference", "The reference for scaling the Gaussian noise. Default is " + ScalingReference.UNITCUBE + ", parameter " + PERCENTAGE_ID.getName() + " will then directly define the standard deviation of all noise Gaussians. For options " + ScalingReference.STDDEV + " and " + ScalingReference.MINMAX + ", the percentage of the attributewise standard deviation or extension, repectively, will define the attributewise standard deviation of the noise Gaussians.");
+
+ /**
+ * Parameter for selecting the noise distribution.
+ *
+ * <p>
+ * Key: {@code -perturbationfilter.noisedistribution}
+ * </p>
+ * <p>
+ * Default: <code>NoiseDistribution.UNIFORM</code>
+ * </p>
+ *
+ */
+ public static final OptionID NOISEDISTRIBUTION_ID = new OptionID("perturbationfilter.noisedistribution", "The nature of the noise distribution, default is " + NoiseDistribution.UNIFORM);
+
+ /**
+ * Percentage of the variance of the random Gaussian noise generation or of
+ * the range of the uniform distribution, given the variance of the
+ * corresponding attribute in the data.
+ */
+ protected double percentage;
+
+ /**
+ * The option which reference to use for scaling the noise.
+ */
+ protected ScalingReference scalingreference;
+
+ /**
+ * The option which nature of noise distribution to choose.
+ */
+ protected NoiseDistribution noisedistribution;
+
+ /**
+ * Read the options of the perturbation filter: scaling reference, noise
+ * distribution, percentage, optional seed and optional preset minima/maxima.
+ *
+ * Minima and maxima must be given together and have the same size.
+ *
+ * @param config Parameterization to read the options from
+ */
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final EnumParameter<ScalingReference> refParam = new EnumParameter<>(SCALINGREFERENCE_ID, ScalingReference.class, ScalingReference.UNITCUBE);
+ if(config.grab(refParam)) {
+ scalingreference = refParam.getValue();
+ }
+ final EnumParameter<NoiseDistribution> noiseParam = new EnumParameter<>(NOISEDISTRIBUTION_ID, NoiseDistribution.class, NoiseDistribution.UNIFORM);
+ if(config.grab(noiseParam)) {
+ noisedistribution = noiseParam.getValue();
+ }
+ // Percentage must lie in (0, 1].
+ final DoubleParameter pctParam = new DoubleParameter(PERCENTAGE_ID, .01) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE) //
+ .addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(pctParam)) {
+ percentage = pctParam.getValue().doubleValue();
+ }
+ final LongParameter seedParam = new LongParameter(SEED_ID);
+ seedParam.setOptional(true);
+ if(config.grab(seedParam)) {
+ seed = seedParam.getValue();
+ }
+ final DoubleListParameter minParam = new DoubleListParameter(MINIMA_ID);
+ minParam.setOptional(true);
+ if(config.grab(minParam)) {
+ minima = ArrayLikeUtil.toPrimitiveDoubleArray(minParam.getValue());
+ }
+ final DoubleListParameter maxParam = new DoubleListParameter(MAXIMA_ID);
+ maxParam.setOptional(true);
+ if(config.grab(maxParam)) {
+ maxima = ArrayLikeUtil.toPrimitiveDoubleArray(maxParam.getValue());
+ }
+
+ // Preset ranges are only usable if both bounds are given with equal size.
+ config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minParam, maxParam));
+ config.checkConstraint(new EqualSizeGlobalConstraint(minParam, maxParam));
+ }
+
+ /**
+ * Create the filter from the option values read by this parameterizer.
+ *
+ * @return new {@link PerturbationFilter} instance
+ */
+ @Override
+ protected PerturbationFilter<V> makeInstance() {
+ return new PerturbationFilter<>(seed, percentage, scalingreference, minima, maxima, noisedistribution);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java
index af3f4c6e..e58ea3b0 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/ProjectionFilter.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.filter.transform;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -37,6 +37,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @author Erich Schubert
*
+ * @apiviz.composedOf Projection
+ *
* @param <I> Input type
* @param <O> Output type
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java
index 7082f103..3a81b989 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFilter.java
index 020dcb31..582eba65 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,6 +31,8 @@ import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -46,6 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @apiviz.uses LabelList oneway - - «reads»
* @apiviz.has ClassLabel
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ClassLabelFilter" })
public class ClassLabelFilter implements ObjectFilter {
/**
* The index of the label to be used as class label, null if no class label is
@@ -180,7 +183,7 @@ public class ClassLabelFilter implements ObjectFilter {
}
@Override
- protected Object makeInstance() {
+ protected ClassLabelFilter makeInstance() {
+ // Return type narrowed from Object, so callers receive a ClassLabelFilter without casting.
return new ClassLabelFilter(classLabelIndex, classLabelFactory);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFromPatternFilter.java
index 517eb301..3ced4e2f 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ClassLabelFromPatternFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,7 +23,8 @@ package de.lmu.ifi.dbs.elki.datasource.filter;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.BitSet;
+import gnu.trove.list.array.TIntArrayList;
+
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
@@ -31,6 +32,8 @@ import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -42,6 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter;
*
* @author Erich Schubert
*/
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ClassLabelFromPatternFilter" })
public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
/**
* Current meta data
@@ -51,7 +55,7 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
/**
- * Bitset of label columns
+ * Indexes of the label columns
*/
- BitSet labelcols = new BitSet();
+ TIntArrayList labelcols = new TIntArrayList();
/**
* Label to return for positive matches.
@@ -98,16 +102,16 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
@Override
public BundleMeta getMeta() {
- if (meta == null) {
+ if(meta == null) {
// Rebuild metadata.
BundleMeta origmeta = source.getMeta();
meta = new BundleMeta(origmeta.size() + 1);
meta.add(TypeUtil.SIMPLE_CLASSLABEL);
labelcols.clear();
- for (int i = 0; i < origmeta.size(); i++) {
+ for(int i = 0; i < origmeta.size(); i++) {
final SimpleTypeInformation<?> orig = origmeta.get(i);
- if (TypeUtil.GUESSED_LABEL.isAssignableFromType(orig)) {
- labelcols.set(i);
+ if(TypeUtil.GUESSED_LABEL.isAssignableFromType(orig)) {
+ labelcols.add(i);
}
meta.add(orig);
}
@@ -117,27 +121,27 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
@Override
public Object data(int rnum) {
- if (rnum > 0) {
+ if(rnum > 0) {
return source.data(rnum - 1);
}
- if (meta == null) {
+ if(meta == null) {
getMeta(); // Trigger build
}
- for (int i = labelcols.nextSetBit(0); i >= 0; i = labelcols.nextSetBit(i + 1)) {
- Object o = source.data(i);
- if (o == null) {
+ for(int i = 0; i < labelcols.size(); i++) {
+ Object o = source.data(labelcols.get(i));
+ if(o == null) {
continue;
}
- if (o instanceof LabelList) {
+ if(o instanceof LabelList) {
final LabelList ll = (LabelList) o;
for(int j = 0; j < ll.size(); j++) {
- if (pattern.matcher(ll.get(j)).find()) {
+ if(pattern.matcher(ll.get(j)).find()) {
return positive;
}
}
continue;
}
- if (pattern.matcher(o.toString()).find()) {
+ if(pattern.matcher(o.toString()).find()) {
return positive;
}
}
@@ -147,7 +151,7 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
@Override
public Event nextEvent() {
final Event ev = source.nextEvent();
- if (Event.META_CHANGED.equals(ev)) {
+ if(Event.META_CHANGED.equals(ev)) {
meta = null;
}
return ev;
@@ -191,17 +195,17 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter {
super.makeOptions(config);
PatternParameter patternP = new PatternParameter(PATTERN_ID);
- if (config.grab(patternP)) {
+ if(config.grab(patternP)) {
pattern = patternP.getValue();
}
StringParameter positiveP = new StringParameter(POSITIVE_ID, "positive");
- if (config.grab(positiveP)) {
+ if(config.grab(positiveP)) {
positive = positiveP.getValue();
}
StringParameter negativeP = new StringParameter(NEGATIVE_ID, "negative");
- if (config.grab(negativeP)) {
+ if(config.grab(negativeP)) {
negative = negativeP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ExternalIDFilter.java
index 17538dc9..3947a7cd 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/ExternalIDFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -31,6 +31,8 @@ import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -44,7 +46,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
* @apiviz.uses LabelList oneway - - «reads»
* @apiviz.has ExternalID oneway - - «produces»
*/
-// TODO: use a non-string class for external ids?
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.ExternalIDFilter" })
public class ExternalIDFilter implements ObjectFilter {
/**
* The index of the label to be used as external Id.
@@ -143,7 +145,7 @@ public class ExternalIDFilter implements ObjectFilter {
}
@Override
- protected Object makeInstance() {
+ protected ExternalIDFilter makeInstance() {
+ // Return type narrowed from Object, so callers receive an ExternalIDFilter without casting.
return new ExternalIDFilter(externalIdIndex);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java
new file mode 100644
index 00000000..97a5d59d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/MultivariateTimeSeriesFilter.java
@@ -0,0 +1,124 @@
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.FeatureVector;
+import de.lmu.ifi.dbs.elki.data.type.MultivariateSeriesTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractStreamConversionFilter;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * Class to "fold" a flat number vector into a multivariate time series.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class MultivariateTimeSeriesFilter<V extends FeatureVector<?>> extends AbstractStreamConversionFilter<V, V> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(MultivariateTimeSeriesFilter.class);
+
+ /**
+ * Number of variates to use.
+ */
+ int variates;
+
+ /**
+ * Constructor.
+ *
+ * @param variates Number of variates.
+ */
+ public MultivariateTimeSeriesFilter(int variates) {
+ super();
+ this.variates = variates;
+ }
+
+ @Override
+ protected V filterSingleObject(V obj) {
+ if(obj.getDimensionality() % variates != 0) {
+ throw new AbortException("Vector length " + obj.getDimensionality() + " not divisible by the number of variates " + variates);
+ }
+ return obj;
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> getInputTypeRestriction() {
+ return TypeUtil.FEATURE_VECTORS;
+ }
+
+ @Override
+ protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
+ VectorTypeInformation<V> vin = (VectorTypeInformation<V>) in;
+ return new MultivariateSeriesTypeInformation<>(vin.getFactory(), in.getSerializer(), vin.mindim(), vin.maxdim(), variates);
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends FeatureVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter for specifying the number of variates of this series.
+ */
+ public static final OptionID VARIATES_ID = new OptionID("series.variates", "Number of variates this time series has.");
+
+ /**
+ * Number of variates to use.
+ */
+ int variates;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ IntParameter variatesP = new IntParameter(VARIATES_ID)//
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(variatesP)) {
+ variates = variatesP.intValue();
+ if(variates == 1) {
+ LOG.warning("For univariate series, you should not need to use this filter.");
+ }
+ }
+ }
+
+ @Override
+ protected MultivariateTimeSeriesFilter<V> makeInstance() {
+ return new MultivariateTimeSeriesFilter<>(variates);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SparseVectorFieldFilter.java
index 97960907..2b84f0a6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SparseVectorFieldFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SparseVectorFieldFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,7 +27,10 @@ import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.filter.AbstractConversionFilter;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
/**
* Class that turns sparse float vectors into a proper vector field, by setting
@@ -37,7 +40,8 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
*
* @param <V> Vector type
*/
-public class SparseVectorFieldFilter<V extends SparseNumberVector<?>> extends AbstractConversionFilter<V, V> {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.SparseVectorFieldFilter" })
+public class SparseVectorFieldFilter<V extends SparseNumberVector> extends AbstractConversionFilter<V, V> {
/**
* Class logger.
*/
@@ -79,7 +83,7 @@ public class SparseVectorFieldFilter<V extends SparseNumberVector<?>> extends Ab
@Override
protected SimpleTypeInformation<? super V> convertedType(SimpleTypeInformation<V> in) {
- SparseNumberVector.Factory<V, ?> factory = (SparseNumberVector.Factory<V, ?>) FilterUtil.guessFactory(in);
+ SparseNumberVector.Factory<V> factory = (SparseNumberVector.Factory<V>) FilterUtil.guessFactory(in);
return new VectorFieldTypeInformation<>(factory, maxdim);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SplitNumberVectorFilter.java
index 6ac046ec..81f640df 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/SplitNumberVectorFilter.java
@@ -1,10 +1,10 @@
-package de.lmu.ifi.dbs.elki.datasource.filter;
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,11 +27,13 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.NumberVector.Factory;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.datasource.filter.FilterUtil;
+import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -48,7 +50,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
*
* @param <V> Vector type
*/
-public class SplitNumberVectorFilter<V extends NumberVector<?>> implements ObjectFilter {
+@Alias({ "de.lmu.ifi.dbs.elki.datasource.filter.normalization.SplitNumberVectorFilter" })
+public class SplitNumberVectorFilter<V extends NumberVector> implements ObjectFilter {
/**
* Selected dimensions.
*/
@@ -83,7 +86,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec
// Should be a vector type after above test.
@SuppressWarnings("unchecked")
final VectorFieldTypeInformation<V> vtype = VectorFieldTypeInformation.class.cast(type);
- Factory<V, ?> factory = FilterUtil.guessFactory(vtype);
+ NumberVector.Factory<V> factory = FilterUtil.guessFactory(vtype);
// Get the replacement type informations
VectorFieldTypeInformation<V> type1 = new VectorFieldTypeInformation<>(factory, dims.length);
@@ -144,7 +147,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec
for(int i = 1; i < dims.length; i++) {
m = Math.max(dims[i], m);
}
- return new VectorFieldTypeInformation<>(NumberVector.class, m, Integer.MAX_VALUE);
+ return VectorFieldTypeInformation.typeRequest(NumberVector.class, m, Integer.MAX_VALUE);
}
/**
@@ -154,7 +157,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* The parameter listing the split dimensions.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java
new file mode 100644
index 00000000..d582c8d2
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/typeconversions/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Filters to perform data type conversions.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.datasource.filter.typeconversions; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/package-info.java
index 98ce5b36..24e8aae6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/package-info.java
@@ -5,7 +5,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
index e8201db1..53d814ea 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,9 +23,11 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.io.Tokenizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -45,7 +47,7 @@ public abstract class AbstractParser {
/**
* A pattern defining whitespace.
*/
- public static final String DEFAULT_SEPARATOR = "(\\s+|\\s*[,;]\\s*)";
+ public static final String DEFAULT_SEPARATOR = "\\s*[,;\\s]\\s*";
/**
* A quote pattern
@@ -74,7 +76,7 @@ public abstract class AbstractParser {
/**
* Comment pattern.
*/
- protected Pattern comment = null;
+ private Matcher comment = null;
/**
* String tokenizer.
@@ -91,10 +93,16 @@ public abstract class AbstractParser {
public AbstractParser(Pattern colSep, String quoteChars, Pattern comment) {
super();
this.tokenizer = new Tokenizer(colSep, quoteChars);
- this.comment = comment;
+ // A null pattern disables comment detection: isComment() and cleanup() both
+ // treat a null matcher as "no comment pattern", so do not dereference it here.
+ this.comment = (comment != null) ? comment.matcher("") : null;
}
- public static int lengthWithoutLinefeed(String line) {
+ /**
+ * Get the length of the string, not taking trailing linefeeds into account.
+ *
+ * @param line Input line
+ * @return Length
+ */
+ public static int lengthWithoutLinefeed(CharSequence line) {
int length = line.length();
while(length > 0) {
char last = line.charAt(length - 1);
@@ -114,6 +122,26 @@ public abstract class AbstractParser {
protected abstract Logging getLogger();
/**
+ * Cleanup internal data structures.
+ *
+ * Delegates to the tokenizer's cleanup and, if a comment matcher exists,
+ * resets it to an empty input.
+ */
+ public void cleanup() {
+ tokenizer.cleanup();
+ if(comment != null) {
+ // Matcher.reset(CharSequence) replaces the matcher's input sequence, so the last
+ // tested line is no longer referenced by the matcher.
+ comment.reset("");
+ }
+ }
+
+ /**
+ * Match a comment line.
+ *
+ * Uses {@code Matcher.matches()}, i.e. the whole line must match the comment
+ * pattern. The single matcher instance of this parser is reused for every
+ * call (a {@code java.util.regex.Matcher} is not safe for concurrent use).
+ *
+ * @param line Line to test
+ * @return {@code true} if the line matches the comment pattern; always
+ *         {@code false} when no comment matcher is set.
+ */
+ protected boolean isComment(CharSequence line) {
+ return (comment != null && comment.reset(line).matches());
+ }
+
+ /**
* Returns a string representation of the object.
*
* @return a string representation of the object.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java
index a218638a..9e58aced 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,18 +23,41 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.io.BufferedReader;
+import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.util.regex.Pattern;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.utilities.io.LineReader;
/**
* Base class for streaming parsers.
*
+ * TODO: build our own replacement for {@link BufferedReader}, which recycles
+ * the string builder.
+ *
* @author Erich Schubert
*/
public abstract class AbstractStreamingParser extends AbstractParser implements StreamingParser {
/**
+ * Line reader.
+ */
+ private LineReader reader;
+
+ /**
+ * The buffer we read the data into.
+ */
+ private StringBuilder buf = new StringBuilder();
+
+ /**
+ * Current line number.
+ */
+ private int lineNumber;
+
+ /**
* Constructor.
*
* @param colSep Column separator pattern
@@ -50,4 +73,67 @@ public abstract class AbstractStreamingParser extends AbstractParser implements
this.initStream(in);
return MultipleObjectsBundle.fromStream(this);
}
+
+ @Override
+ public void initStream(InputStream in) {
+ // NOTE(review): InputStreamReader without an explicit charset decodes with the
+ // platform default charset - confirm this is intended for data files.
+ // A reader left over from an earlier call is replaced here without being closed.
+ reader = new LineReader(new InputStreamReader(in));
+ // Restart line counting for the new stream.
+ lineNumber = 0;
+ }
+
+ /**
+ * Get the current line number.
+ *
+ * The counter is reset to 0 by {@code initStream} and incremented for every
+ * line read by {@code nextLineExceptComments}, including skipped empty and
+ * comment lines.
+ *
+ * @return Current line number
+ */
+ protected int getLineNumber() {
+ return lineNumber;
+ }
+
+ @Override
+ public boolean hasDBIDs() {
+ // This base implementation never reports DBIDs of its own.
+ return false;
+ }
+
+ @Override
+ public boolean assignDBID(DBIDVar var) {
+ // Consistent with hasDBIDs() returning false: clear the variable and return false.
+ var.unset();
+ return false;
+ }
+
+ /**
+ * Read the next non-empty, non-comment line into the tokenizer.
+ *
+ * Lines that are empty (not counting trailing linefeed characters) or that
+ * match the comment pattern are skipped, but still increase the line number.
+ *
+ * @return {@code true} if a line was loaded into the tokenizer,
+ *         {@code false} once the reader delivers no further line.
+ * @throws IOException if reading from the line reader fails
+ */
+ protected boolean nextLineExceptComments() throws IOException {
+ // The buffer is emptied and reused for every line.
+ while(reader.readLine(buf.delete(0, buf.length()))) {
+ ++lineNumber;
+ final int len = lengthWithoutLinefeed(buf);
+ // The comment test sees the whole buffer; the tokenizer only the first len characters.
+ if(len > 0 && !isComment(buf)) {
+ tokenizer.initialize(buf, 0, len);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Cleanup: run the superclass cleanup, close the line reader (if any) and
+ * release the line buffer.
+ *
+ * Errors while closing the reader are ignored, and the buffer is released
+ * regardless of whether closing succeeded.
+ */
+ @Override
+ public void cleanup() {
+ super.cleanup();
+ try {
+ if(reader != null) {
+ reader.close();
+ }
+ }
+ catch(IOException e) {
+ // Ignore - maybe already closed.
+ }
+ finally {
+ // Release the line buffer even when closing the reader failed.
+ buf.setLength(0);
+ buf.trimToSize();
+ }
+ }
+
+ @Override
+ public MultipleObjectsBundle asMultipleObjectsBundle() {
+ // Collects this stream via MultipleObjectsBundle.fromStream. This method does not call
+ // initStream itself - presumably the stream must have been initialized before; verify.
+ return MultipleObjectsBundle.fromStream(this);
+ }
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
index 50714b81..1c9bf4e9 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -78,22 +78,22 @@ public class ArffParser implements Parser {
/**
* Arff file marker.
*/
- public static final Pattern ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE);
+ public static final Matcher ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE).matcher("");
/**
* Arff attribute declaration marker.
*/
- public static final Pattern ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE);
+ public static final Matcher ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE).matcher("");
/**
* Arff data marker.
*/
- public static final Pattern ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE);
+ public static final Matcher ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE).matcher("");
/**
* Comment pattern.
*/
- public static final Pattern ARFF_COMMENT = Pattern.compile("^\\s*%.*");
+ public static final Matcher ARFF_COMMENT = Pattern.compile("^\\s*%.*").matcher("");
/**
* Pattern to auto-convert columns to external ids.
@@ -108,22 +108,22 @@ public class ArffParser implements Parser {
/**
* Pattern for numeric columns.
*/
- public static final Pattern ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE);
+ public static final Matcher ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE).matcher("");
/**
* Empty line pattern.
*/
- public static final Pattern EMPTY = Pattern.compile("^\\s*$");
+ public static final Matcher EMPTY = Pattern.compile("^\\s*$").matcher("");
/**
* Pattern to recognize external ids.
*/
- Pattern magic_eid;
+ Matcher magic_eid;
/**
* Pattern to recognize class label columns.
*/
- Pattern magic_class;
+ Matcher magic_class;
/**
* (Reused) buffer for building label lists.
@@ -138,8 +138,8 @@ public class ArffParser implements Parser {
*/
public ArffParser(Pattern magic_eid, Pattern magic_class) {
super();
- this.magic_eid = magic_eid;
- this.magic_class = magic_class;
+ this.magic_eid = magic_eid.matcher("");
+ this.magic_class = magic_class.matcher("");
}
/**
@@ -477,11 +477,11 @@ public class ArffParser implements Parser {
throw new AbortException(ARFF_HEADER_RELATION + " not found in file.");
}
// Skip comments and empty lines
- if(ARFF_COMMENT.matcher(line).matches() || EMPTY.matcher(line).matches()) {
+ if(ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) {
continue;
}
// Break on relation statement
- if(ARFF_HEADER_RELATION.matcher(line).matches()) {
+ if(ARFF_HEADER_RELATION.reset(line).matches()) {
break;
}
throw new AbortException("Expected relation declaration: " + line);
@@ -505,15 +505,15 @@ public class ArffParser implements Parser {
throw new AbortException(ARFF_HEADER_DATA + " not found in file.");
}
// Skip comments and empty lines
- if(ARFF_COMMENT.matcher(line).matches() || EMPTY.matcher(line).matches()) {
+ if(ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) {
continue;
}
// Break on data statement to continue
- if(ARFF_HEADER_DATA.matcher(line).matches()) {
+ if(ARFF_HEADER_DATA.reset(line).matches()) {
break;
}
// Expect an attribute specification
- Matcher matcher = ARFF_HEADER_ATTRIBUTE.matcher(line);
+ Matcher matcher = ARFF_HEADER_ATTRIBUTE.reset(line);
if(matcher.matches()) {
String name = matcher.group(1);
if(name.charAt(0) == '\'' && name.charAt(name.length() - 1) == '\'') {
@@ -547,7 +547,7 @@ public class ArffParser implements Parser {
private void processColumnTypes(ArrayList<String> names, ArrayList<String> types, int[] targ, TypeInformation[] etyp, int[] dims) {
int next = 0;
for(int i = 0; i < targ.length; i++) {
- if(magic_eid != null && magic_eid.matcher(names.get(i)).matches()) {
+ if(magic_eid != null && magic_eid.reset(names.get(i)).matches()) {
// Turn into an external ID column.
targ[i] = next;
etyp[next] = TypeUtil.EXTERNALID;
@@ -555,7 +555,7 @@ public class ArffParser implements Parser {
next++;
continue;
}
- else if(magic_class != null && magic_class.matcher(names.get(i)).matches()) {
+ else if(magic_class != null && magic_class.reset(names.get(i)).matches()) {
// Type as ClassLabel
targ[i] = next;
etyp[next] = TypeUtil.CLASSLABEL;
@@ -563,7 +563,7 @@ public class ArffParser implements Parser {
next++;
continue;
}
- else if(ARFF_NUMERIC.matcher(types.get(i)).matches()) {
+ else if(ARFF_NUMERIC.reset(types.get(i)).matches()) {
// Create a number vector field
if(next > 0 && TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[next - 1])) {
targ[i] = next - 1;
@@ -629,6 +629,16 @@ public class ArffParser implements Parser {
}
}
+ @Override
+ public void cleanup() {
+ if (magic_eid != null) {
+ magic_eid.reset("");
+ }
+ if (magic_class != null) {
+ magic_class.reset("");
+ }
+ }
+
/**
* Parameterization class.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
index 26bc38af..cdca7b05 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,27 +23,18 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
+import gnu.trove.list.array.TLongArrayList;
+
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.BitVector;
import de.lmu.ifi.dbs.elki.data.LabelList;
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
/**
- * Provides a parser for parsing one BitVector per line, bits separated by
- * whitespace.
+ * Parser for parsing one BitVector per line, bits separated by whitespace.
* <p/>
* Several labels may be given per BitVector. A label must not be parseable as
* Bit. Lines starting with &quot;#&quot; will be ignored.
@@ -53,14 +44,24 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
* @apiviz.has BitVector
*/
@Title("Bit Vector Label Parser")
-@Description("Parses the following format of lines:\n" + "A single line provides a single BitVector. Bits are separated by whitespace. Any substring not containing whitespace is tried to be read as Bit. If this fails, it will be appended to a label. (Thus, any label must not be parseable as Bit.) Empty lines and lines beginning with \"#\" will be ignored. If any BitVector differs in its dimensionality from other BitVectors, the parse method will fail with an Exception.")
-public class BitVectorLabelParser extends AbstractParser implements Parser {
+@Description("Parses the following format of lines:\n" + //
+"A single line provides a single BitVector. Bits are separated by whitespace. " + //
+"Any substring not containing whitespace is tried to be read as Bit. " + //
+"If this fails, it will be appended to a label. " + //
+"(Thus, any label must not be parseable as Bit.) " + //
+"Empty lines and lines beginning with \"#\" will be ignored.")
+public class BitVectorLabelParser extends NumberVectorLabelParser<BitVector> implements Parser {
/**
* Class logger
*/
private static final Logging LOG = Logging.getLogger(BitVectorLabelParser.class);
/**
+ * Buffer, will be reused.
+ */
+ TLongArrayList buf = new TLongArrayList();
+
+ /**
* Constructor.
*
* @param colSep Column separator
@@ -68,57 +69,37 @@ public class BitVectorLabelParser extends AbstractParser implements Parser {
* @param comment Comment pattern
*/
public BitVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment) {
- super(colSep, quoteChars, comment);
+ super(colSep, quoteChars, comment, null, BitVector.FACTORY);
}
@Override
- public MultipleObjectsBundle parse(InputStream in) {
- BufferedReader reader = new BufferedReader(new InputStreamReader(in));
- int lineNumber = 0;
- int dimensionality = -1;
- List<BitVector> vectors = new ArrayList<>();
- List<LabelList> labels = new ArrayList<>();
- ArrayList<String> ll = new ArrayList<>();
- try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- // Skip empty lines and comments
- if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
- continue;
- }
- BitSet bitSet = new BitSet();
- ll.clear();
- int i = 0;
- for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance()) {
- try {
- if(tokenizer.getLongBase10() > 0) {
- bitSet.set(i);
- }
- ++i;
- }
- catch(NumberFormatException e) {
- ll.add(tokenizer.getSubstring());
- }
+ protected boolean parseLineInternal() {
+ int curdim = 0;
+ for(; tokenizer.valid(); tokenizer.advance()) {
+ try {
+ final int word = curdim >>> 6;
+ final int off = curdim & 0x3F;
+ if(word >= buf.size()) { // Ensure size.
+ buf.add(0L);
}
-
- if(dimensionality < 0) {
- dimensionality = i;
- }
- else if(dimensionality != i) {
- throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ".");
+ if(tokenizer.getLongBase10() > 0) {
+ buf.set(word, buf.get(word) | (1L << off));
}
-
- vectors.add(new BitVector(bitSet, dimensionality));
- labels.add(LabelList.make(ll));
+ ++curdim;
+ }
+ catch(NumberFormatException e) {
+ labels.add(tokenizer.getSubstring());
}
}
- catch(IOException e) {
- throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
+ if(curdim == 0) { // Maybe a label row
+ return false;
}
- return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels);
- }
- protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<>(BitVector.FACTORY, dimensionality);
+ curvec = new BitVector(buf.toArray(), curdim);
+ curlbl = LabelList.make(labels);
+ buf.clear();
+ labels.clear();
+ return true;
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/CategorialDataAsNumberVectorParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/CategorialDataAsNumberVectorParser.java
index 3dd49470..0471ffae 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/CategorialDataAsNumberVectorParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/CategorialDataAsNumberVectorParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,6 +26,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.util.BitSet;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
@@ -50,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
@Description("This parser expects data in roughly the same format as the NumberVectorLabelParser,\n"//
+ "except that it will enumerate all unique strings to always produce numerical values.\n"//
+ "This way, it can for example handle files that contain lines like 'y,n,y,y,n,y,n'.")
-public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> extends NumberVectorLabelParser<V> {
+public class CategorialDataAsNumberVectorParser<V extends NumberVector> extends NumberVectorLabelParser<V> {
/**
* Logging class.
*/
@@ -69,14 +70,14 @@ public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> exten
/**
* Pattern for NaN values.
*/
- Pattern nanpattern = Pattern.compile("\\?");
+ Matcher nanpattern = Pattern.compile("\\?").matcher("Dummy text");
/**
* Constructor with defaults.
*
* @param factory Vector factory
*/
- public CategorialDataAsNumberVectorParser(NumberVector.Factory<V, ?> factory) {
+ public CategorialDataAsNumberVectorParser(NumberVector.Factory<V> factory) {
this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory);
}
@@ -89,7 +90,7 @@ public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> exten
* @param labelIndices Column indexes that are numeric.
* @param factory Vector factory
*/
- public CategorialDataAsNumberVectorParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
+ public CategorialDataAsNumberVectorParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V> factory) {
super(colSep, quoteChars, comment, labelIndices, factory);
}
@@ -103,14 +104,10 @@ public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> exten
}
@Override
- protected void parseLineInternal(String line) {
- // Split into numerical attributes and labels
- attributes.reset();
- labels.clear();
-
+ protected boolean parseLineInternal() {
int i = 0;
- for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) {
- if(labelIndices == null || !labelIndices.get(i)) {
+ for(/* Initialized by nextLineExceptComments */; tokenizer.valid(); tokenizer.advance(), i++) {
+ if(!isLabelColumn(i)) {
try {
double attribute = tokenizer.getDouble();
attributes.add(attribute);
@@ -118,7 +115,7 @@ public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> exten
}
catch(NumberFormatException e) {
String s = tokenizer.getSubstring();
- if(nanpattern.matcher(s).matches()) {
+ if(nanpattern.reset(s).matches()) {
attributes.add(Double.NaN);
continue;
}
@@ -138,6 +135,9 @@ public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> exten
// Pass outside via class variables
curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
curlbl = LabelList.make(labels);
+ attributes.reset();
+ labels.clear();
+ return true;
}
@Override
@@ -152,7 +152,7 @@ public class CategorialDataAsNumberVectorParser<V extends NumberVector<?>> exten
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
+ public static class Parameterizer<V extends NumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
@Override
protected CategorialDataAsNumberVectorParser<V> makeInstance() {
return new CategorialDataAsNumberVectorParser<>(colSep, quoteChars, comment, labelIndices, factory);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ClusteringVectorParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ClusteringVectorParser.java
new file mode 100644
index 00000000..9b812792
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ClusteringVectorParser.java
@@ -0,0 +1,268 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.iterator.TIntIntIterator;
+import gnu.trove.iterator.TIntObjectIterator;
+import gnu.trove.list.array.TIntArrayList;
+import gnu.trove.map.TIntIntMap;
+import gnu.trove.map.TIntObjectMap;
+import gnu.trove.map.hash.TIntIntHashMap;
+import gnu.trove.map.hash.TIntObjectHashMap;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.LabelList;
+import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
+import de.lmu.ifi.dbs.elki.data.model.Model;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.ClusteringVectorDumper;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+
+/**
+ * Parser for simple clustering results in vector form, as written by
+ * {@link ClusteringVectorDumper}.
+ *
+ * This allows reading the output of <em>multiple</em> clustering runs, and
+ * analyzing the results using ELKI algorithms.
+ *
+ * The input format is very simple, each line containing a sequence of cluster
+ * assignments in integer form, and an optional label:
+ *
+ * <pre>
+ * 0 0 1 1 0 First
+ * 0 0 0 1 2 Second
+ * </pre>
+ *
+ * represents two clusterings for 5 objects. The first clustering has two
+ * clusters, the second contains three clusters.
+ *
+ * TODO: this parser currently is quite hacky, and could use a cleanup.
+ *
+ * TODO: support noise, via negative cluster numbers?
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has Clustering
+ */
+public class ClusteringVectorParser extends AbstractStreamingParser {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(ClusteringVectorParser.class);
+
+ /**
+ * Number of different terms observed.
+ */
+ int numterms;
+
+ /**
+ * Metadata.
+ */
+ protected BundleMeta meta;
+
+ /**
+ * Event to report next.
+ */
+ Event nextevent;
+
+ /**
+ * Current clustering.
+ */
+ Clustering<Model> curclu;
+
+ /**
+ * Current labels.
+ */
+ LabelList curlbl;
+
+ /**
+ * Buffer for the parsed cluster numbers, will be reused.
+ */
+ TIntArrayList buf1 = new TIntArrayList();
+
+ /**
+ * Range of the DBID values.
+ */
+ DBIDRange range = null;
+
+ /**
+ * Buffer for labels.
+ */
+ ArrayList<String> lbl = new ArrayList<>();
+
+ /**
+ * Flag if labels are present.
+ */
+ boolean haslbl;
+
+ /**
+ * Constructor.
+ *
+ * @param colSep Column separator
+ * @param quoteChars Quote character
+ * @param comment Comment pattern
+ */
+ public ClusteringVectorParser(Pattern colSep, String quoteChars, Pattern comment) {
+ super(colSep, quoteChars, comment);
+ }
+
+ @Override
+ public void initStream(InputStream in) {
+ super.initStream(in);
+ range = null; // New range
+ haslbl = false;
+ }
+
+ @Override
+ public Event nextEvent() {
+ if(nextevent != null) {
+ Event ret = nextevent;
+ nextevent = null;
+ return ret;
+ }
+ try {
+ while(nextLineExceptComments()) {
+ buf1.clear();
+ lbl.clear();
+ TIntIntMap csize = new TIntIntHashMap();
+ TIntObjectMap<ModifiableDBIDs> clusters = new TIntObjectHashMap<>();
+ String name = null;
+ for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) {
+ try {
+ int cnum = (int) tokenizer.getLongBase10();
+ buf1.add(cnum);
+ // Update cluster sizes:
+ if(!csize.increment(cnum)) {
+ csize.put(cnum, 1);
+ }
+ }
+ catch(NumberFormatException e) {
+ final String label = tokenizer.getSubstring();
+ lbl.add(label);
+ if(name == null) {
+ name = label;
+ }
+ }
+ }
+ if(name == null) {
+ name = "Cluster";
+ }
+ // Update meta on first record:
+ boolean metaupdate = (range == null);
+ if(range == null) {
+ range = DBIDUtil.generateStaticDBIDRange(buf1.size());
+ }
+ if(buf1.size() != range.size()) {
+ throw new AbortException("Clusterings do not contain the same number of elements!");
+ }
+ // Build clustering to store in the relation.
+ curclu = new Clustering<>(name, name);
+ for(TIntIntIterator iter = csize.iterator(); iter.hasNext();) {
+ iter.advance();
+ if(iter.value() > 0) {
+ clusters.put(iter.key(), DBIDUtil.newArray(iter.value()));
+ }
+ }
+ DBIDArrayIter iter = range.iter();
+ for(int i = 0; i < buf1.size(); i++) {
+ clusters.get(buf1.get(i)).add(iter.seek(i));
+ }
+ for(TIntObjectIterator<ModifiableDBIDs> iter2 = clusters.iterator(); iter2.hasNext();) {
+ iter2.advance();
+ curclu.addToplevelCluster(new Cluster<Model>(iter2.value(), ClusterModel.CLUSTER));
+ }
+ // Label handling.
+ if(!haslbl && lbl.size() > 0) {
+ haslbl = true;
+ metaupdate = true;
+ }
+ curlbl = LabelList.make(lbl);
+ if(metaupdate) {
+ nextevent = Event.NEXT_OBJECT; // Force a meta update.
+ return Event.META_CHANGED;
+ }
+ return Event.NEXT_OBJECT;
+ }
+ return Event.END_OF_STREAM;
+ }
+ catch(IOException e) {
+ throw new IllegalArgumentException("Error while parsing line " + getLineNumber() + ".");
+ }
+ }
+
+ @Override
+ public Object data(int rnum) {
+ if(rnum == 0) {
+ return curclu;
+ }
+ if(rnum == 1) {
+ return curlbl;
+ }
+ throw new ArrayIndexOutOfBoundsException();
+ }
+
+ @Override
+ public BundleMeta getMeta() {
+ if(meta == null) {
+ meta = new BundleMeta(haslbl ? 2 : 1);
+ meta.add(new SimpleTypeInformation<>(Clustering.class, "Clusters"));
+ if(haslbl) {
+ meta.add(TypeUtil.LABELLIST);
+ }
+ }
+ return meta;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractStreamingParser.Parameterizer {
+ @Override
+ protected ClusteringVectorParser makeInstance() {
+ return new ClusteringVectorParser(colSep, quoteChars, comment);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
deleted file mode 100644
index bf84f2ce..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
+++ /dev/null
@@ -1,104 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.parser;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2011
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import java.util.BitSet;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.data.DoubleVector;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-
-/**
- * <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
- * </p>
- * <p>
- * Several labels may be given per point. A label must not be parseable as
- * double. Lines starting with &quot;#&quot; will be ignored.
- * </p>
- * <p/>
- * <p>
- * An index can be specified to identify an entry to be treated as class label.
- * This index counts all entries (numeric and labels as well) starting with 0.
- * </p>
- *
- * @author Arthur Zimek
- *
- * @apiviz.has DoubleVector
- *
- * @deprecated Use NumberVectorLabelParser instead, which defaults to
- * DoubleVector.
- */
-@Deprecated
-public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVector> {
- /**
- * Class logger
- */
- private static final Logging LOG = Logging.getLogger(DoubleVectorLabelParser.class);
-
- /**
- * Constructor.
- *
- * @param colSep Column separator
- * @param quoteChars Quotation character
- * @param comment Comment pattern
- * @param labelIndices Indices to use as labels
- */
- public DoubleVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices) {
- super(colSep, quoteChars, comment, labelIndices, DoubleVector.FACTORY);
- }
-
- /**
- * Constructor with default values.
- */
- public DoubleVectorLabelParser() {
- this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), new BitSet());
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer extends NumberVectorLabelParser.Parameterizer<DoubleVector> {
- @Override
- protected void getFactory(Parameterization config) {
- // Do nothing: not used
- }
-
- @Override
- protected DoubleVectorLabelParser makeInstance() {
- return new DoubleVectorLabelParser(colSep, quoteChars, comment, labelIndices);
- }
- }
-}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
deleted file mode 100644
index 6d800cd8..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
+++ /dev/null
@@ -1,99 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.parser;
-/*
-This file is part of ELKI:
-Environment for Developing KDD-Applications Supported by Index-Structures
-
-Copyright (C) 2013
-Ludwig-Maximilians-Universität München
-Lehr- und Forschungseinheit für Datenbanksysteme
-ELKI Development Team
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-import java.util.BitSet;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.data.FloatVector;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-
-/**
- * <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
- * </p>
- * <p>
- * Numerical values in a line will be parsed as double values but used in float
- * precision only.
- * </p>
- * <p>
- * Several labels may be given per point. A label must not be parseable as
- * double. Lines starting with &quot;#&quot; will be ignored.
- * </p>
- * <p/>
- * <p>
- * An index can be specified to identify an entry to be treated as class label.
- * This index counts all entries (numeric and labels as well) starting with 0.
- * </p>
- *
- * @author Arthur Zimek
- *
- * @apiviz.has FloatVector
- *
- * @deprecated Use NumberVectorLabelParser instead, and use vector type FloatVector.
- */
-@Deprecated
-public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector> {
- /**
- * Class logger.
- */
- private static final Logging LOG = Logging.getLogger(FloatVectorLabelParser.class);
-
- /**
- * Constructor.
- *
- * @param colSep Column separator
- * @param quoteChars Quotation character
- * @param comment Comment pattern
- * @param labelIndices Indices to use as labels
- */
- public FloatVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices) {
- super(colSep, quoteChars, comment, labelIndices, FloatVector.FACTORY);
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer extends NumberVectorLabelParser.Parameterizer<FloatVector> {
- @Override
- protected void getFactory(Parameterization config) {
- // Do nothing: not used
- }
-
- @Override
- protected FloatVectorLabelParser makeInstance() {
- return new FloatVectorLabelParser(colSep, quoteChars, comment, labelIndices);
- }
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java
new file mode 100644
index 00000000..3a7ac44a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java
@@ -0,0 +1,151 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.LabelList;
+import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
+import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Parser to read libSVM format files.
+ *
+ *
+ * The format of libSVM is roughly specified in the README given:
+ *
+ * <pre>
+ * &lt;label&gt; &lt;index1&gt;:&lt;value1&gt; &lt;index2&gt;:&lt;value2&gt; ...
+ * </pre>
+ *
+ * i.e. a mandatory integer class label in the beginning followed by a classic
+ * sparse vector representation of the data. Indexes are integers, starting at 1.
+ * (Note that ELKI uses 0-based indexing, so we will map these to index-1, to
+ * not always have a constant-0 dimension 0.)
+ *
+ * The FAQ states that you can also put comments into the file, introduced by a
+ * hash sign: <tt>#</tt>, but they must not contain colons and are not officially
+ * supported. ELKI will simply stop parsing a line when encountering a
+ * <tt>#</tt>.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class LibSVMFormatParser<V extends SparseNumberVector> extends SparseNumberVectorLabelParser<V> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(LibSVMFormatParser.class);
+
+ /**
+ * LibSVM uses whitespace and colons for separation.
+ */
+ public static final Pattern WHITESPACE_PATTERN = Pattern.compile("(\\s+|:)");
+
+ /**
+ * Comment pattern.
+ */
+ public static final Pattern COMMENT_PATTERN = Pattern.compile("#");
+
+ /**
+ * Constructor.
+ *
+ * @param factory Vector factory
+ */
+ public LibSVMFormatParser(SparseNumberVector.Factory<V> factory) {
+ super(WHITESPACE_PATTERN, null, COMMENT_PATTERN, null, factory);
+ }
+
+ @Override
+ protected boolean parseLineInternal() {
+ /* tokenizer initialized by nextLineExceptComments() */
+ int thismax = 0;
+
+ // TODO: rely on the string being numeric for performance
+ // But it might be missing sometimes, or "?"
+ labels.add(tokenizer.getSubstring());
+ tokenizer.advance();
+ haslabels = true; // libSVM always has labels.
+
+ while(tokenizer.valid()) {
+ try {
+ int index = (int) tokenizer.getLongBase10();
+ tokenizer.advance();
+ double attribute = tokenizer.getDouble();
+ tokenizer.advance();
+ thismax = Math.max(thismax, index + 1);
+ values.put(index, attribute);
+ }
+ catch(NumberFormatException e) {
+ String comment = tokenizer.getSubstring();
+ if(comment.charAt(0) == '#') {
+ break;
+ }
+ throw new RuntimeException("Parsing error in line " + getLineNumber() + ": expected data, got " + comment);
+ }
+ }
+ curvec = sparsefactory.newNumberVector(values, thismax);
+ curlbl = LabelList.make(labels);
+ values.clear();
+ labels.clear();
+ return true;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
+ @Override
+ protected void getFactory(Parameterization config) {
+ ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ if(config.grab(factoryP)) {
+ factory = factoryP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ // Avoid additional options: super.makeOptions(config);
+ getFactory(config);
+ }
+
+ @Override
+ protected LibSVMFormatParser<V> makeInstance() {
+ return new LibSVMFormatParser<>((SparseNumberVector.Factory<V>) factory);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index 3fe4af09..e09dcd22 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,13 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
import gnu.trove.list.array.TDoubleArrayList;
-import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.BitSet;
-import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
@@ -46,6 +43,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.hash.Unique;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -53,18 +51,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
- * <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
- * </p>
- * <p>
+ * Parser for a simple CSV type of format, with columns separated by the given
+ * pattern (default: whitespace).
+ *
* Several labels may be given per point. A label must not be parseable as
* double. Lines starting with &quot;#&quot; will be ignored.
- * </p>
- * <p>
+ *
* An index can be specified to identify an entry to be treated as class label.
* This index counts all entries (numeric and labels as well) starting with 0.
- * </p>
*
* @author Arthur Zimek
*
@@ -73,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> the type of NumberVector used
*/
-public class NumberVectorLabelParser<V extends NumberVector<?>> extends AbstractStreamingParser {
+public class NumberVectorLabelParser<V extends NumberVector> extends AbstractStreamingParser {
/**
* Logging class.
*/
@@ -82,22 +76,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* Keeps the indices of the attributes to be treated as a string label.
*/
- protected BitSet labelIndices;
+ private BitSet labelIndices;
/**
* Vector factory class.
*/
- protected NumberVector.Factory<V, ?> factory;
-
- /**
- * Buffer reader.
- */
- private BufferedReader reader;
-
- /**
- * Current line number.
- */
- protected int lineNumber;
+ protected NumberVector.Factory<V> factory;
/**
* Dimensionality reported.
@@ -115,11 +99,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
protected List<String> columnnames = null;
/**
- * Bitset to indicate which columns are not numeric.
- */
- protected BitSet labelcolumns = null;
-
- /**
* Whether or not the data set has labels.
*/
protected boolean haslabels = false;
@@ -147,7 +126,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* For String unification.
*/
- HashMap<String, String> unique = new HashMap<>();
+ Unique<String> unique = new Unique<>();
/**
* Event to report next.
@@ -159,7 +138,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*
* @param factory Vector factory
*/
- public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) {
+ public NumberVectorLabelParser(NumberVector.Factory<V> factory) {
this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory);
}
@@ -172,24 +151,30 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param labelIndices Column indexes that are numeric.
* @param factory Vector factory
*/
- public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
+ public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V> factory) {
super(colSep, quoteChars, comment);
this.labelIndices = labelIndices;
this.factory = factory;
}
+ /**
+ * Test if the current column is marked as label column.
+ *
+ * @param col Column number
+ * @return {@code true} when a label column.
+ */
+ protected boolean isLabelColumn(int col) {
+ return labelIndices != null && labelIndices.get(col);
+ }
+
@Override
public void initStream(InputStream in) {
- reader = new BufferedReader(new InputStreamReader(in));
- lineNumber = 1;
+ super.initStream(in);
mindim = Integer.MAX_VALUE;
maxdim = 0;
columnnames = null;
haslabels = false;
- labelcolumns = new BitSet();
- if(labelIndices != null) {
- labelcolumns.or(labelIndices);
- }
+ nextevent = null;
}
@Override
@@ -205,41 +190,37 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
return ret;
}
try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- // Skip empty lines and comments
- if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
- continue;
- }
- parseLineInternal(line);
- // Maybe a header column?
- if(curvec == null) {
- continue;
- }
- final int curdim = curvec.getDimensionality();
- if(curdim > maxdim || mindim > curdim) {
- mindim = Math.min(mindim, curdim);
- maxdim = Math.max(maxdim, curdim);
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
- }
- else if(curlbl != null && meta != null && meta.size() == 1) {
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
+ while(nextLineExceptComments()) {
+ if(parseLineInternal()) {
+ final int curdim = curvec.getDimensionality();
+ if(curdim > maxdim || mindim > curdim) {
+ mindim = (curdim < mindim) ? curdim : mindim;
+ maxdim = (curdim > maxdim) ? curdim : maxdim;
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ }
+ else if(curlbl != null && meta != null && meta.size() == 1) {
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ }
+ return Event.NEXT_OBJECT;
}
- return Event.NEXT_OBJECT;
}
- reader.close();
- reader = null;
- unique.clear();
return Event.END_OF_STREAM;
}
catch(IOException e) {
- throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
+ throw new IllegalArgumentException("Error while parsing line " + getLineNumber() + ".");
}
}
+ @Override
+ public void cleanup() {
+ super.cleanup();
+ unique.clear();
+ }
+
/**
* Update the meta element.
*/
@@ -257,13 +238,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
@Override
public Object data(int rnum) {
- if(rnum == 0) {
- return curvec;
- }
- if(rnum == 1) {
- return curlbl;
+ if(rnum < 0 || rnum > 1) { // only 0 (vector) and 1 (labels) exist; negatives must fail as before
+ throw new ArrayIndexOutOfBoundsException(rnum);
}
- throw new ArrayIndexOutOfBoundsException();
+ return (rnum == 0) ? curvec : curlbl;
}
/**
@@ -271,16 +249,14 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* as well as block parsing. This saves the building of meta data for each
* line.
*
- * @param line Line to process
+ * @return {@code true} when a valid line was read, {@code false} on a label
+ * row.
*/
- protected void parseLineInternal(String line) {
- attributes.reset();
- labels.clear();
-
+ protected boolean parseLineInternal() {
// Split into numerical attributes and labels
int i = 0;
- for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) {
- if(labelIndices == null || !labelIndices.get(i)) {
+ for(/* initialized by nextLineExceptComments()! */; tokenizer.valid(); tokenizer.advance(), i++) {
+ if(!isLabelColumn(i) && !tokenizer.isQuoted()) {
try {
double attribute = tokenizer.getDouble();
attributes.add(attribute);
@@ -288,34 +264,30 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
}
catch(NumberFormatException e) {
// Ignore attempt, add to labels below.
- labelcolumns.set(i);
}
}
// Else: labels.
- haslabels = true;
- final String lbl = tokenizer.getSubstring();
- String u = unique.get(lbl);
- if(u == null) {
- u = lbl;
- unique.put(u, u);
+ String lbl = tokenizer.getStrippedSubstring();
+ if(lbl.length() > 0) {
+ haslabels = true;
+ lbl = unique.addOrGet(lbl);
+ labels.add(lbl);
}
- labels.add(u);
}
// Maybe a label row?
- if(lineNumber == 1 && attributes.size() == 0) {
+ if(getLineNumber() == 1 && attributes.size() == 0) {
columnnames = new ArrayList<>(labels);
- labelcolumns.clear();
- if(labelIndices != null) {
- labelcolumns.or(labelIndices);
- }
+ haslabels = false;
curvec = null;
curlbl = null;
- haslabels = false;
- return;
+ return false;
}
// Pass outside via class variables
curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
curlbl = LabelList.make(labels);
+ attributes.reset();
+ labels.clear();
+ return true;
}
/**
@@ -338,28 +310,28 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @return Prototype object
*/
SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
+ if(mindim > maxdim) {
+ throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
+ }
if(mindim == maxdim) {
String[] colnames = null;
if(columnnames != null) {
- if(columnnames.size() - labelcolumns.cardinality() == mindim) {
- colnames = new String[mindim];
- for(int i = 0, j = 0; i < columnnames.size(); i++) {
- if(!labelcolumns.get(i)) {
- colnames[j] = columnnames.get(i);
- j++;
- }
+ colnames = new String[mindim];
+ int j = 0; // number of dimensions that received a name
+ for(int i = 0; i < mindim && i < columnnames.size(); i++) { // never read past a short header row
+ if(!isLabelColumn(i)) {
+ colnames[j] = columnnames.get(i);
+ j++;
}
}
+ if(j != mindim) {
+ colnames = null; // Could not name every dimension - report none rather than a partly-null array
+ }
}
return new VectorFieldTypeInformation<>(factory, mindim, colnames);
}
- else if(mindim < maxdim) {
- // Variable dimensionality - return non-vector field type
- return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
- }
- else {
- throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
- }
+ // Variable dimensionality - return non-vector field type
+ return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim);
}
@Override
@@ -374,7 +346,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParser.Parameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParser.Parameterizer {
/**
* A comma separated list of the indices of labels (may be numeric),
* counting whitespace separated entries in a line starting with 0. The
@@ -402,7 +374,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* Factory object.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
@Override
protected void makeOptions(Parameterization config) {
@@ -417,7 +389,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param config Parameterization
*/
protected void getFactory(Parameterization config) {
- ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
+ ObjectParameter<NumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
if(config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
index a0b4e573..df1dc5f6 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
import java.io.InputStream;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
/**
* A Parser shall provide a ParsingResult by parsing an InputStream.
@@ -37,7 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
* @apiviz.uses InputStream
* @apiviz.has MultipleObjectsBundle oneway - - «create»
*/
-public interface Parser extends Parameterizable {
+public interface Parser {
/**
* Returns a list of the objects parsed from the specified input stream.
*
@@ -45,4 +44,9 @@ public interface Parser extends Parameterizable {
* @return a list containing those objects parsed from the input stream
*/
MultipleObjectsBundle parse(InputStream in);
+
+ /**
+ * Perform cleanup operations after parsing.
+ */
+ void cleanup();
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
index 457a161b..0283a791 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -95,7 +95,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
try {
for(String line; (line = reader.readLine()) != null; lineNumber++) {
// Skip empty lines and comments
- if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ if(line.length() <= 0 || isComment(line)) {
continue;
}
Object[] objs = parseLine(line);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimpleTransactionParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimpleTransactionParser.java
new file mode 100644
index 00000000..a8ffef4b
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimpleTransactionParser.java
@@ -0,0 +1,200 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.iterator.TObjectIntIterator;
+import gnu.trove.list.array.TLongArrayList;
+import gnu.trove.map.TObjectIntMap;
+import gnu.trove.map.hash.TObjectIntHashMap;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.BitVector;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+
+/**
+ * Simple parser for transactional data, such as market baskets.
+ *
+ * To keep the input format simple and readable, all tokens are assumed to be of
+ * text and separated by whitespace, and each transaction is on a separate line.
+ *
+ * An example file containing two transactions looks like this
+ *
+ * <pre>
+ * bread butter milk
+ * paste tomato basil
+ * </pre>
+ *
+ * TODO: add a parameter to e.g. use the first or last entry as labels instead
+ * of tokens.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has BitVector
+ */
+public class SimpleTransactionParser extends AbstractStreamingParser {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(SimpleTransactionParser.class);
+
+ /**
+ * Number of different terms observed.
+ */
+ int numterms;
+
+ /**
+ * Map.
+ */
+ TObjectIntMap<String> keymap;
+
+ /**
+ * Metadata.
+ */
+ protected BundleMeta meta;
+
+ /**
+ * Event to report next.
+ */
+ Event nextevent;
+
+ /**
+ * Current vector.
+ */
+ BitVector curvec;
+
+ /**
+ * Buffer, will be reused.
+ */
+ TLongArrayList buf = new TLongArrayList();
+
+ /**
+ * Constructor.
+ *
+ * @param colSep Column separator
+ * @param quoteChars Quote character
+ * @param comment Comment pattern
+ */
+ public SimpleTransactionParser(Pattern colSep, String quoteChars, Pattern comment) {
+ super(colSep, quoteChars, comment);
+ keymap = new TObjectIntHashMap<>(1001, .5f, -1);
+ }
+
+ @Override
+ public void initStream(InputStream in) {
+ super.initStream(in);
+ nextevent = Event.META_CHANGED; // Initial event.
+ }
+
+ @Override
+ public Event nextEvent() {
+ if(nextevent != null) {
+ Event ret = nextevent;
+ nextevent = null;
+ return ret;
+ }
+ try {
+ while(nextLineExceptComments()) {
+ // Don't reuse bitsets, will not be copied by BitVector constructor.
+ buf.clear();
+ for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) {
+ String token = tokenizer.getSubstring();
+ int t = keymap.get(token);
+ if(t < 0) {
+ t = keymap.size();
+ keymap.put(token, t);
+ }
+ final int word = t >>> 6;
+ final int off = t & 0x3F;
+ while(word >= buf.size()) { // Ensure size.
+ buf.add(0L);
+ }
+ buf.set(word, buf.get(word) | (1L << off));
+ }
+ curvec = new BitVector(buf.toArray(), keymap.size());
+ return Event.NEXT_OBJECT;
+ }
+ nextevent = Event.END_OF_STREAM;
+ // Construct final metadata:
+ meta = new BundleMeta(1);
+ String[] colnames = new String[keymap.size()];
+ for(TObjectIntIterator<String> iter = keymap.iterator(); iter.hasNext();) {
+ iter.advance();
+ colnames[iter.value()] = iter.key();
+ }
+ meta.add(new VectorFieldTypeInformation<>(BitVector.FACTORY, colnames.length, colnames));
+ return Event.META_CHANGED; // Force a final meta update.
+ }
+ catch(IOException e) {
+ throw new IllegalArgumentException("Error while parsing line " + getLineNumber() + ".");
+ }
+ }
+
+ @Override
+ public void cleanup() {
+ super.cleanup();
+ curvec = null;
+ }
+
+ @Override
+ public Object data(int rnum) {
+ if(rnum == 0) {
+ return curvec;
+ }
+ throw new ArrayIndexOutOfBoundsException();
+ }
+
+ @Override
+ public BundleMeta getMeta() {
+ if(meta == null) {
+ meta = new BundleMeta(1);
+ meta.add(new VectorTypeInformation<>(BitVector.FACTORY, BitVector.SHORT_SERIALIZER, 0, numterms));
+ }
+ return meta;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractStreamingParser.Parameterizer {
+ @Override
+ protected SimpleTransactionParser makeInstance() {
+ return new SimpleTransactionParser(colSep, quoteChars, comment);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
deleted file mode 100644
index 06925e67..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
+++ /dev/null
@@ -1,143 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.parser;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.data.BitVector;
-import de.lmu.ifi.dbs.elki.data.LabelList;
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-
-/**
- * Provides a parser for parsing one sparse BitVector per line, where the
- * indices of the one-bits are separated by whitespace. The first index starts
- * with zero.
- * <p/>
- * Several labels may be given per BitVector, a label must not be parseable as
- * an Integer. Lines starting with &quot;#&quot; will be ignored.
- *
- * @author Elke Achtert
- *
- * @apiviz.has BitVector
- */
-@Title("Sparse Bit Vector Label Parser")
-@Description("Parser for the lines of the following format:\n" + "A single line provides a single sparse BitVector. The indices of the one-bits are " + "separated by whitespace. The first index starts with zero. Any substring not containing whitespace is tried to be read as an Integer. " + "If this fails, it will be appended to a label. (Thus, any label must not be parseable as an Integer.) " + "Empty lines and lines beginning with \"#\" will be ignored.")
-public class SparseBitVectorLabelParser extends AbstractParser implements Parser {
- /**
- * Class logger
- */
- private static final Logging LOG = Logging.getLogger(SparseBitVectorLabelParser.class);
-
- /**
- * Constructor.
- *
- * @param colSep Column separator
- * @param quoteChars Quotation character
- * @param comment Comment pattern
- */
- public SparseBitVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment) {
- super(colSep, quoteChars, comment);
- }
-
- @Override
- public MultipleObjectsBundle parse(InputStream in) {
- BufferedReader reader = new BufferedReader(new InputStreamReader(in));
- int lineNumber = 0;
- int dimensionality = -1;
- List<BitVector> vectors = new ArrayList<>();
- List<LabelList> lblc = new ArrayList<>();
- try {
- List<BitSet> bitSets = new ArrayList<>();
- List<LabelList> allLabels = new ArrayList<>();
- ArrayList<String> labels = new ArrayList<>();
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- // Skip empty lines and comments
- if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
- continue;
- }
- BitSet bitSet = new BitSet();
- labels.clear();
-
- for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance()) {
- try {
- int index = (int) tokenizer.getLongBase10();
- bitSet.set(index);
- dimensionality = Math.max(dimensionality, index);
- }
- catch(NumberFormatException e) {
- labels.add(tokenizer.getSubstring());
- }
- }
-
- bitSets.add(bitSet);
- allLabels.add(LabelList.make(labels));
- }
-
- ++dimensionality;
- for(int i = 0; i < bitSets.size(); i++) {
- vectors.add(new BitVector(bitSets.get(i), dimensionality));
- lblc.add(allLabels.get(i));
- }
- }
- catch(IOException e) {
- throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
- }
- return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, lblc);
- }
-
- protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<>(BitVector.FACTORY, dimensionality);
- }
-
- @Override
- protected Logging getLogger() {
- return LOG;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer extends AbstractParser.Parameterizer {
- @Override
- protected SparseBitVectorLabelParser makeInstance() {
- return new SparseBitVectorLabelParser(colSep, quoteChars, comment);
- }
- }
-}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
deleted file mode 100644
index 87efbf55..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
+++ /dev/null
@@ -1,97 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.parser;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2012
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import java.util.BitSet;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-
-/**
- * <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
- * </p>
- * <p>
- * Several labels may be given per point. A label must not be parseable as
- * double. Lines starting with &quot;#&quot; will be ignored.
- * </p>
- * <p>
- * A line is expected in the following format: The first entry of each line is
- * the number of attributes with coordinate value not zero. Subsequent entries
- * are of the form <code>index value </code> each, where index is the number of
- * the corresponding dimension, and value is the value of the corresponding
- * attribute. A complet line then could look like this:
- *
- * <pre>
- * 3 7 12.34 8 56.78 11 1.234 objectlabel
- * </pre>
- *
- * where <code>3</code> indicates there are three attributes set,
- * <code>7,8,11</code> are the attributes indexes and there is a non-numerical
- * object label.
- * </p>
- * <p>
- * An index can be specified to identify an entry to be treated as class label.
- * This index counts all entries (numeric and labels as well) starting with 0.
- * </p>
- *
- * @author Arthur Zimek
- *
- * @apiviz.has SparseFloatVector
- *
- * @deprecated Use {@link SparseNumberVectorLabelParser} instead!
- */
-@Title("Sparse Float Vector Label Parser")
-@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored.")
-@Deprecated
-public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser<SparseFloatVector> {
- /**
- * Constructor.
- *
- * @param colSep Column separator
- * @param quoteChars Quotation character
- * @param comment Comment pattern
- * @param labelIndices Indices to use as labels
- */
- public SparseFloatVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices) {
- super(colSep, quoteChars, comment, labelIndices, SparseFloatVector.FACTORY);
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer extends SparseNumberVectorLabelParser.Parameterizer<SparseFloatVector> {
- @Override
- protected SparseFloatVectorLabelParser makeInstance() {
- return new SparseFloatVectorLabelParser(colSep, quoteChars, comment, labelIndices);
- }
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
index 902d59a9..f9b34c92 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -44,8 +44,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
+ * Parser for parsing one point per line, attributes separated by whitespace.
* </p>
* <p>
* Several labels may be given per point. A label must not be parseable as
@@ -56,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* the number of attributes with coordinate value not zero. Subsequent entries
* are of the form <code>index value </code> each, where index is the number of
* the corresponding dimension, and value is the value of the corresponding
- * attribute. A complet line then could look like this:
+ * attribute. A complete line then could look like this:
*
* <pre>
* 3 7 12.34 8 56.78 11 1.234 objectlabel
@@ -77,10 +76,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> vector type
*/
-// FIXME: Maxdim!
@Title("Sparse Vector Label Parser")
@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored.")
-public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> extends NumberVectorLabelParser<V> {
+public class SparseNumberVectorLabelParser<V extends SparseNumberVector> extends NumberVectorLabelParser<V> {
/**
* Class logger.
*/
@@ -89,7 +87,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
/**
* Same as {@link #factory}, but subtype.
*/
- private SparseNumberVector.Factory<V, ?> sparsefactory;
+ protected SparseNumberVector.Factory<V> sparsefactory;
/**
* (Reused) set of values for the number vector.
@@ -110,18 +108,17 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
* @param labelIndices Indices to use as labels
* @param factory Vector factory
*/
- public SparseNumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ public SparseNumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V> factory) {
super(colSep, quoteChars, comment, labelIndices, factory);
this.sparsefactory = factory;
}
@Override
- protected void parseLineInternal(String line) {
- tokenizer.initialize(line, 0, lengthWithoutLinefeed(line));
+ protected boolean parseLineInternal() {
+ /* tokenizer initialized by nextLineExceptComments() */
int cardinality = (int) tokenizer.getLongBase10();
+ tokenizer.advance();
- values.clear();
- labels.clear();
int thismax = 0;
while(tokenizer.valid()) {
@@ -130,7 +127,10 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
int index = (int) tokenizer.getLongBase10();
tokenizer.advance();
// Respect labelIndices.
- if(labelIndices == null || !labelIndices.get(index)) {
+ if(!isLabelColumn(index)) {
+ if(!tokenizer.valid()) {
+ throw new AbortException("Parser expected double value, but line ended too early: " + getLineNumber());
+ }
double attribute = tokenizer.getDouble();
thismax = Math.max(thismax, index + 1);
values.put(index, attribute);
@@ -149,6 +149,9 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
}
curvec = sparsefactory.newNumberVector(values, thismax);
curlbl = LabelList.make(labels);
+ values.clear();
+ labels.clear();
+ return true;
}
@Override
@@ -157,7 +160,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
return new VectorFieldTypeInformation<>(factory, mindim);
}
else if(mindim < maxdim) {
- return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
+ return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim);
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@@ -174,10 +177,10 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
+ public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
if(config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -185,7 +188,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
@Override
protected SparseNumberVectorLabelParser<V> makeInstance() {
- return new SparseNumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
+ return new SparseNumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, (SparseNumberVector.Factory<V>) factory);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java
index 73d38e3c..23326a32 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java
@@ -1,14 +1,10 @@
package de.lmu.ifi.dbs.elki.datasource.parser;
-import java.io.InputStream;
-
-import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,6 +23,10 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.io.InputStream;
+
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
+
/**
* Interface for streaming parsers, that may be much more efficient in
* combination with filters.
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java
index 6541b881..6e5773ee 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
@@ -57,7 +58,7 @@ public class StringParser implements Parser {
/**
* Comment pattern.
*/
- Pattern comment;
+ Matcher comment;
/**
* Flag to trim whitespace.
@@ -72,7 +73,7 @@ public class StringParser implements Parser {
*/
public StringParser(Pattern comment, boolean trimWhitespace) {
super();
- this.comment = comment;
+ this.comment = (comment != null) ? comment.matcher("") : null;
this.trimWhitespace = trimWhitespace;
}
@@ -84,9 +85,9 @@ public class StringParser implements Parser {
List<LabelList> labels = new ArrayList<>();
ArrayList<String> ll = new ArrayList<>(1);
try {
- for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ for(String line; (line = reader.readLine()) != null; lineNumber++) {
// Skip empty lines and comments
- if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ if(line.length() <= 0 || (comment != null && comment.reset(line).matches())) {
continue;
}
final String val = trimWhitespace ? line.trim() : line;
@@ -95,12 +96,18 @@ public class StringParser implements Parser {
ll.add(val);
labels.add(LabelList.make(ll));
}
- } catch (IOException e) {
+ }
+ catch(IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
return MultipleObjectsBundle.makeSimple(TypeUtil.STRING, data, TypeUtil.LABELLIST, labels);
}
+ @Override
+ public void cleanup() {
+ // The constructor leaves the matcher null when no comment pattern was given,
+ // so guard the reset the same way parse() guards its use of the matcher.
+ if(comment != null) {
+ // Point the reusable matcher at an empty input instead of the last line.
+ comment.reset("");
+ }
+ }
+
/**
* Parameterization class.
*
@@ -129,12 +136,12 @@ public class StringParser implements Parser {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
PatternParameter commentP = new PatternParameter(AbstractParser.Parameterizer.COMMENT_ID, "^\\s*#.*$");
- if (config.grab(commentP)) {
+ if(config.grab(commentP)) {
comment = commentP.getValue();
}
Flag trimP = new Flag(TRIM_ID);
- if (config.grab(trimP)) {
+ if(config.grab(trimP)) {
trimWhitespace = trimP.isTrue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
index f0ccbf50..c750b687 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -51,13 +51,16 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* A parser to load term frequency data, which essentially are sparse vectors
* with text keys.
*
+ * If your data does not contain frequencies, consider using
+ * {@link SimpleTransactionParser} instead.
+ *
* @author Erich Schubert
*
- * @apiviz.has SparseFloatVector
+ * @apiviz.has SparseNumberVector
*/
@Title("Term frequency parser")
@Description("Parse a file containing term frequencies. The expected format is 'label term1 <freq> term2 <freq> ...'. Terms must not contain the separator character!")
-public class TermFrequencyParser<V extends SparseNumberVector<?>> extends NumberVectorLabelParser<V> {
+public class TermFrequencyParser<V extends SparseNumberVector> extends NumberVectorLabelParser<V> {
/**
* Class logger.
*/
@@ -81,7 +84,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
/**
* Same as {@link #factory}, but subtype.
*/
- private SparseNumberVector.Factory<V, ?> sparsefactory;
+ private SparseNumberVector.Factory<V> sparsefactory;
/**
* (Reused) set of values for the number vector.
@@ -92,7 +95,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
* (Reused) label buffer.
*/
ArrayList<String> labels = new ArrayList<>();
-
+
/**
* Constructor.
*
@@ -102,7 +105,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
* @param comment Comment pattern
* @param labelIndices Indices to use as labels
*/
- public TermFrequencyParser(boolean normalize, Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ public TermFrequencyParser(boolean normalize, Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V> factory) {
super(colSep, quoteChars, comment, labelIndices, factory);
this.normalize = normalize;
this.keymap = new TObjectIntHashMap<>(1001, .5f, -1);
@@ -110,42 +113,40 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
}
@Override
- protected void parseLineInternal(String line) {
+ protected boolean parseLineInternal() {
double len = 0;
- values.clear();
- labels.clear();
String curterm = null;
- for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance()) {
+ for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) {
if(curterm == null) {
curterm = tokenizer.getSubstring();
+ continue;
}
- else {
- try {
- double attribute = tokenizer.getDouble();
- int curdim = keymap.get(curterm);
- if(curdim < 0) {
- curdim = numterms;
- keymap.put(curterm, curdim);
- ++numterms;
- }
- values.put(curdim, attribute);
- len += attribute;
- curterm = null;
+ try {
+ double attribute = tokenizer.getDouble();
+ int curdim = keymap.get(curterm);
+ if(curdim < 0) {
+ curdim = numterms;
+ keymap.put(curterm, curdim);
+ ++numterms;
}
- catch(NumberFormatException e) {
- if(curterm != null) {
- labels.add(curterm);
- }
- curterm = tokenizer.getSubstring();
+ values.put(curdim, attribute);
+ len += attribute;
+ curterm = null;
+ }
+ catch(NumberFormatException e) {
+ if(curterm != null) {
+ labels.add(curterm);
}
+ curterm = tokenizer.getSubstring();
}
}
if(curterm != null) {
labels.add(curterm);
}
+ haslabels |= (labels.size() > 0);
if(normalize) {
- if(Math.abs(len - 1.0) > 1E-10 && len > 1E-10) {
+ if(Math.abs(len - 1.0) > Double.MIN_NORMAL) {
for(TIntDoubleIterator iter = values.iterator(); iter.hasNext();) {
iter.advance();
iter.setValue(iter.value() / len);
@@ -155,6 +156,9 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
curvec = sparsefactory.newNumberVector(values, numterms);
curlbl = LabelList.make(labels);
+ values.clear();
+ labels.clear();
+ return true;
}
@Override
@@ -163,7 +167,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
return new VectorFieldTypeInformation<>(factory, mindim);
}
else if(mindim < maxdim) {
- return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
+ return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim);
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@@ -180,7 +184,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
+ public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
/**
* Option ID for normalization.
*/
@@ -196,13 +200,13 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
super.makeOptions(config);
Flag normF = new Flag(NORMALIZE_FLAG);
if(config.grab(normF)) {
- normalize = normF.getValue().booleanValue();
+ normalize = normF.isTrue();
}
}
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
if(config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -210,7 +214,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
@Override
protected TermFrequencyParser<V> makeInstance() {
- return new TermFrequencyParser<>(normalize, colSep, quoteChars, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
+ return new TermFrequencyParser<>(normalize, colSep, quoteChars, comment, labelIndices, (SparseNumberVector.Factory<V>) factory);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/Tokenizer.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/Tokenizer.java
deleted file mode 100644
index 0cf4c81a..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/Tokenizer.java
+++ /dev/null
@@ -1,230 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.parser;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2013
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
-import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.Iter;
-
-/**
- * String tokenizer.
- *
- * @author Erich Schubert
- */
-public class Tokenizer implements Iter {
- /**
- * Class logger.
- */
- private static final Logging LOG = Logging.getLogger(Tokenizer.class);
-
- /**
- * Separator pattern.
- */
- private Pattern colSep;
-
- /**
- * Quote characters
- */
- public static final String QUOTE_CHAR = "\"'";
-
- /**
- * Stores the quotation character
- */
- private char[] quoteChars = QUOTE_CHAR.toCharArray();
-
- /**
- * Constructor.
- *
- * @param colSep Column separator pattern.
- * @param quoteChars Quotation character.
- */
- public Tokenizer(Pattern colSep, String quoteChars) {
- super();
- this.colSep = colSep;
- this.quoteChars = quoteChars.toCharArray();
- }
-
- /**
- * Regular expression match helper.
- */
- private Matcher m = null;
-
- /**
- * Data currently processed.
- */
- private CharSequence input;
-
- /**
- * Substring to process.
- */
- private int send;
-
- /**
- * Current positions of result and iterator.
- */
- private int start, end, index;
-
- /**
- * Initialize parser with a new string.
- *
- * @param input New string to parse.
- * @param begin Begin
- * @param end End
- */
- public void initialize(CharSequence input, int begin, int end) {
- this.input = input;
- this.send = end;
- this.m = colSep.matcher(input).region(begin, end);
- this.index = begin;
- advance();
- }
-
- @Override
- public boolean valid() {
- return start < send;
- }
-
- @Override
- public void advance() {
- char inquote = isQuote(index);
- while(m.find()) {
- // Quoted code path vs. regular code path
- if(inquote != 0) {
- // Matching closing quote found?
- if(m.start() > index + 1 && input.charAt(m.start() - 1) == inquote) {
- this.start = index + 1;
- this.end = m.start() - 1;
- this.index = m.end();
- return;
- }
- continue;
- }
- else {
- this.start = index;
- this.end = m.start();
- this.index = m.end();
- return;
- }
- }
- // Add tail after last separator.
- this.start = index;
- this.end = send;
- this.index = end + 1;
- if(inquote != 0) {
- final int last = send - 1;
- if(input.charAt(last) == inquote) {
- ++this.start;
- --this.end;
- }
- else {
- LOG.warning("Invalid quoted line in input: no closing quote found in: " + input);
- }
- }
- }
-
- /**
- * Get the current part as substring
- *
- * @return Current value as substring.
- */
- public String getSubstring() {
- // TODO: detect Java <6 and make sure we only return the substring?
- // With java 7, String.substring will arraycopy the characters.
- return input.subSequence(start, end).toString();
- }
-
- /**
- * Get current value as double.
- *
- * @return double value
- * @throws NumberFormatException when current value cannot be parsed as double
- * value.
- */
- public double getDouble() throws NumberFormatException {
- return FormatUtil.parseDouble(input, start, end);
- }
-
- /**
- * Get current value as long.
- *
- * @return double value
- * @throws NumberFormatException when current value cannot be parsed as long
- * value.
- */
- public long getLongBase10() throws NumberFormatException {
- return FormatUtil.parseLongBase10(input, start, end);
- }
-
- /**
- * Test for empty tokens; usually at end of line.
- *
- * @return Empty
- */
- public boolean isEmpty() {
- return end <= start;
- }
-
- /**
- * Detect quote characters.
- *
- * TODO: support more than one quote character, make sure opening and closing
- * quotes match then.
- *
- * @param index Position
- * @return {@code 1} when a quote character, {@code 0} otherwise.
- */
- private char isQuote(int index) {
- if(index >= input.length()) {
- return 0;
- }
- char c = input.charAt(index);
- for(int i = 0; i < quoteChars.length; i++) {
- if(c == quoteChars[i]) {
- return c;
- }
- }
- return 0;
- }
-
- /**
- * Get start of token.
- *
- * @return Start
- */
- public int getStart() {
- return start;
- }
-
- /**
- * Get end of token.
- *
- * @return End
- */
- public int getEnd() {
- return end;
- }
-}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
index c21ab31f..fab27a6a 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
@@ -12,14 +12,14 @@
* any {@link de.lmu.ifi.dbs.elki.KDDTask} will
* use the {@link de.lmu.ifi.dbs.elki.database.StaticArrayDatabase} which,
* in turn, will use a {@link de.lmu.ifi.dbs.elki.datasource.FileBasedDatabaseConnection}
- * and a {@link de.lmu.ifi.dbs.elki.datasource.parser.DoubleVectorLabelParser}
+ * and a {@link de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser}
* to parse a specified data file creating
* a {@link de.lmu.ifi.dbs.elki.database.StaticArrayDatabase}
* containing {@link de.lmu.ifi.dbs.elki.data.DoubleVector}-Objects.</p>
*
* <p>Thus, the standard procedure to use a data set of a real-valued vector space
* is to prepare the data set in a file of the following format
- * (as suitable to {@link de.lmu.ifi.dbs.elki.datasource.parser.DoubleVectorLabelParser}):
+ * (as suitable to {@link de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser}):
* <ul>
* <li>One point per line, attributes separated by whitespace.</li>
* <li>Several labels may be given per point. A label must not be parseable as double.</li>
@@ -42,7 +42,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team