summary | refs | log | tree | commit | diff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java 14
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java 5
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java 3
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java 68
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java 141
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java 102
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java 185
7 files changed, 281 insertions, 237 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
index dcfb8245..3c294ca4 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
@@ -46,12 +46,12 @@ public abstract class AbstractParser {
/**
* A pattern defining whitespace.
*/
- public static final String WHITESPACE_PATTERN = "\\s+";
+ public static final String DEFAULT_SEPARATOR = "(\\s+|\\s*[,;]\\s*)";
/**
* A quote pattern
*/
- public static final String QUOTE_CHAR = "\"";
+ public static final char QUOTE_CHAR = '\"';
/**
* A pattern catching most numbers that can be parsed using Double.parseDouble:
@@ -63,7 +63,7 @@ public abstract class AbstractParser {
/**
* OptionID for the column separator parameter (defaults to whitespace as in
- * {@link #WHITESPACE_PATTERN}.
+ * {@link #DEFAULT_SEPARATOR}).
*/
public static final OptionID COLUMN_SEPARATOR_ID = OptionID.getOrCreateOptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data.");
@@ -81,7 +81,7 @@ public abstract class AbstractParser {
/**
* Stores the quotation character
*/
- protected char quoteChar = QUOTE_CHAR.charAt(0);
+ protected char quoteChar = QUOTE_CHAR;
/**
* The comment character.
@@ -205,16 +205,16 @@ public abstract class AbstractParser {
/**
* Stores the quotation character
*/
- protected char quoteChar = QUOTE_CHAR.charAt(0);
+ protected char quoteChar = QUOTE_CHAR;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, WHITESPACE_PATTERN);
+ PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, DEFAULT_SEPARATOR);
if(config.grab(colParam)) {
colSep = colParam.getValue();
}
- StringParameter quoteParam = new StringParameter(QUOTE_ID, new StringLengthConstraint(1, 1), QUOTE_CHAR);
+ StringParameter quoteParam = new StringParameter(QUOTE_ID, new StringLengthConstraint(1, 1), ""+QUOTE_CHAR);
if(config.grab(quoteParam)) {
quoteChar = quoteParam.getValue().charAt(0);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
index 97c3a8c8..7bc52a29 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
@@ -48,7 +48,10 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
* @author Arthur Zimek
*
* @apiviz.has DoubleVector
+ *
+ * @deprecated Use NumberVectorLabelParser instead, which defaults to DoubleVector.
*/
+@Deprecated
public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVector> {
/**
* Class logger
@@ -70,7 +73,7 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto
* Constructor with default values.
*/
public DoubleVectorLabelParser() {
- this(Pattern.compile(WHITESPACE_PATTERN), QUOTE_CHAR.charAt(0), new BitSet());
+ this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, new BitSet());
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
index 3ea2ac80..6465f89a 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
@@ -51,7 +51,10 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
* @author Arthur Zimek
*
* @apiviz.has FloatVector
+ *
+ * @deprecated Use NumberVectorLabelParser instead, and use vector type FloatVector.
*/
+@Deprecated
public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector> {
/**
* Class logger
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index 01606e77..53ce44bc 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -138,6 +138,16 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
protected BundleMeta meta = null;
/**
+ * Column names
+ */
+ protected List<String> columnnames = null;
+
+ /**
+ * Bitset to indicate which columns are not numeric (label columns)
+ */
+ protected BitSet labelcolumns = null;
+
+ /**
* Current vector
*/
protected V curvec = null;
@@ -153,6 +163,15 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
Event nextevent = null;
/**
+ * Constructor with defaults
+ *
+ * @param factory Vector factory
+ */
+ public NumberVectorLabelParser(V factory) {
+ this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, null, factory);
+ }
+
+ /**
* Constructor
*
* @param colSep
@@ -171,6 +190,8 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
reader = new BufferedReader(new InputStreamReader(in));
lineNumber = 1;
dimensionality = DIMENSIONALITY_UNKNOWN;
+ columnnames = null;
+ labelcolumns = new BitSet();
}
@Override
@@ -189,6 +210,10 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
for(String line; (line = reader.readLine()) != null; lineNumber++) {
if(!line.startsWith(COMMENT) && line.length() > 0) {
parseLineInternal(line);
+ // Maybe a header row?
+ if(curvec == null) {
+ continue;
+ }
if(dimensionality == DIMENSIONALITY_UNKNOWN) {
dimensionality = curvec.getDimensionality();
buildMeta();
@@ -202,6 +227,10 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
nextevent = Event.NEXT_OBJECT;
return Event.META_CHANGED;
}
+ } else if (curlbl != null && meta != null && meta.size() == 1) {
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
}
return Event.NEXT_OBJECT;
}
@@ -219,9 +248,15 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
* Update the meta element.
*/
protected void buildMeta() {
- meta = new BundleMeta(2);
- meta.add(getTypeInformation(dimensionality));
- meta.add(TypeUtil.LABELLIST);
+ if(labelcolumns.cardinality() > 0) {
+ meta = new BundleMeta(2);
+ meta.add(getTypeInformation(dimensionality));
+ meta.add(TypeUtil.LABELLIST);
+ }
+ else {
+ meta = new BundleMeta(1);
+ meta.add(getTypeInformation(dimensionality));
+ }
}
@Override
@@ -259,6 +294,7 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
}
catch(NumberFormatException e) {
// Ignore attempt, add to labels below.
+ labelcolumns.set(i);
}
}
if(labels == null) {
@@ -266,7 +302,15 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
}
labels.add(ent);
}
-
+ // Maybe a label row?
+ if(lineNumber == 1 && attributes.size() == 0) {
+ columnnames = labels;
+ labelcolumns.clear();
+ curvec = null;
+ curlbl = null;
+ return;
+ }
+ // Pass outside via class variables
curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
curlbl = labels;
}
@@ -293,14 +337,26 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
@SuppressWarnings("unchecked")
Class<V> cls = (Class<V>) factory.getClass();
if(dimensionality > 0) {
+ String[] colnames = null;
+ if(columnnames != null) {
+ if(columnnames.size() - labelcolumns.cardinality() == dimensionality) {
+ colnames = new String[dimensionality];
+ for(int i = 0, j = 0; i < columnnames.size(); i++) {
+ if(labelcolumns.get(i) == false) {
+ colnames[j] = columnnames.get(i);
+ j++;
+ }
+ }
+ }
+ }
V f = factory.newNumberVector(new double[dimensionality]);
if(f instanceof ByteBufferSerializer) {
// TODO: Remove, once we have serializers for all types
@SuppressWarnings("unchecked")
final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f;
- return new VectorFieldTypeInformation<V>(cls, ser, dimensionality, f);
+ return new VectorFieldTypeInformation<V>(cls, ser, dimensionality, colnames, f);
}
- return new VectorFieldTypeInformation<V>(cls, dimensionality, f);
+ return new VectorFieldTypeInformation<V>(cls, dimensionality, colnames, f);
}
// Variable dimensionality - return non-vector field type
if(dimensionality == DIMENSIONALITY_VARIABLE) {
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java
deleted file mode 100644
index 8e240ae9..00000000
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java
+++ /dev/null
@@ -1,141 +0,0 @@
-package de.lmu.ifi.dbs.elki.datasource.parser;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2012
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import gnu.trove.list.TDoubleList;
-import gnu.trove.list.array.TDoubleArrayList;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.data.LabelList;
-import de.lmu.ifi.dbs.elki.data.ParameterizationFunction;
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-
-/**
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace. The parser transforms each point into a parameterization function.
- * Several labels may be given per point. A label must not be parseable as
- * double (or float). Lines starting with &quot;#&quot; will be ignored.
- *
- * @author Arthur Zimek
- *
- * @apiviz.has ParameterizationFunction
- */
-@Title("Parameterization Function Label Parser")
-@Description("Parser for the following line format:\n" + "A single line provides a single point. Attributes are separated by whitespace. The real values will be parsed as as doubles. Any substring not containing whitespace is tried to be read as double. If this fails, it will be appended to a label. (Thus, any label must not be parseable " + "as double.) Empty lines and lines beginning with \"#\" will be ignored. If any point differs in its dimensionality from other points, the parse method will fail with an Exception.")
-public class ParameterizationFunctionLabelParser extends AbstractParser implements Parser {
- /**
- * Class logger
- */
- private static final Logging logger = Logging.getLogger(ParameterizationFunctionLabelParser.class);
-
- /**
- * Constructor.
- *
- * @param colSep
- * @param quoteChar
- */
- public ParameterizationFunctionLabelParser(Pattern colSep, char quoteChar) {
- super(colSep, quoteChar);
- }
-
- @Override
- public MultipleObjectsBundle parse(InputStream in) {
- BufferedReader reader = new BufferedReader(new InputStreamReader(in));
- int lineNumber = 1;
- int dimensionality = -1;
- List<ParameterizationFunction> vectors = new ArrayList<ParameterizationFunction>();
- List<LabelList> labels = new ArrayList<LabelList>();
- try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- if(!line.startsWith(COMMENT) && line.length() > 0) {
- List<String> entries = tokenize(line);
- TDoubleList attributes = new TDoubleArrayList(entries.size());
- LabelList labellist = null;
- for(String entry : entries) {
- try {
- double attribute = Double.parseDouble(entry);
- attributes.add(attribute);
- }
- catch(NumberFormatException e) {
- if(labellist == null) {
- labellist = new LabelList(1);
- }
- labellist.add(entry);
- }
- }
-
- if(dimensionality < 0) {
- dimensionality = attributes.size();
- }
- else if(dimensionality != attributes.size()) {
- throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ":" + attributes.size() + " != " + dimensionality);
- }
- vectors.add(ParameterizationFunction.STATIC.newNumberVector(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER));
- labels.add(labellist);
- }
- }
- }
- catch(IOException e) {
- throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
- }
-
- return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels);
- }
-
- protected VectorFieldTypeInformation<ParameterizationFunction> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<ParameterizationFunction>(ParameterizationFunction.class, dimensionality, new ParameterizationFunction(new double[dimensionality]));
- }
-
- @Override
- protected Logging getLogger() {
- return logger;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer extends AbstractParser.Parameterizer {
- @Override
- protected ParameterizationFunctionLabelParser makeInstance() {
- return new ParameterizationFunctionLabelParser(colSep, quoteChar);
- }
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
index 83d6ed13..d0ab7a85 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
@@ -23,20 +23,12 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import gnu.trove.map.hash.TIntFloatHashMap;
-
import java.util.BitSet;
-import java.util.List;
import java.util.regex.Pattern;
-import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
-import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
-import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
-import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
/**
* <p>
@@ -50,9 +42,17 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
* <p>
* A line is expected in the following format: The first entry of each line is
* the number of attributes with coordinate value not zero. Subsequent entries
- * are of the form (index, value), where index is the number of the
- * corresponding dimension, and value is the value of the corresponding
- * attribute.
+ * are of the form <code>index value </code> each, where index is the number of
+ * the corresponding dimension, and value is the value of the corresponding
+ * attribute. A complete line could then look like this:
+ *
+ * <pre>
+ * 3 7 12.34 8 56.78 11 1.234 objectlabel
+ * </pre>
+ *
+ * where <code>3</code> indicates there are three attributes set,
+ * <code>7,8,11</code> are the attribute indexes and there is a non-numerical
+ * object label.
* </p>
* <p>
* An index can be specified to identify an entry to be treated as class label.
@@ -62,87 +62,25 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
* @author Arthur Zimek
*
* @apiviz.has SparseFloatVector
+ *
+ * @deprecated Use {@link SparseNumberVectorLabelParser} instead!
*/
// FIXME: Maxdim!
@Title("Sparse Float Vector Label Parser")
-@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored. Having the file parsed completely, the maximum occuring dimensionality is set as dimensionality to all created SparseFloatvectors.")
-public class SparseFloatVectorLabelParser extends NumberVectorLabelParser<SparseFloatVector> {
- /**
- * Class logger
- */
- private static final Logging logger = Logging.getLogger(SparseFloatVectorLabelParser.class);
-
- /**
- * Holds the dimensionality of the parsed data which is the maximum occurring
- * index of any attribute.
- */
- private int maxdim = -1;
-
+@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored.")
+@Deprecated
+public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser<SparseFloatVector> {
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param labelIndices Label indexes
*/
public SparseFloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
super(colSep, quoteChar, labelIndices, SparseFloatVector.STATIC);
}
- @Override
- protected void parseLineInternal(String line) {
- List<String> entries = tokenize(line);
- int cardinality = Integer.parseInt(entries.get(0));
-
- TIntFloatHashMap values = new TIntFloatHashMap(cardinality, 1);
- LabelList labels = null;
-
- for(int i = 1; i < entries.size() - 1; i++) {
- if(!labelIndices.get(i)) {
- try {
- int index = Integer.valueOf(entries.get(i));
- if(index > maxdim) {
- maxdim = index;
- }
- float attribute = Float.valueOf(entries.get(i));
- values.put(index, attribute);
- i++;
- }
- catch(NumberFormatException e) {
- if(labels == null) {
- labels = new LabelList(1);
- }
- labels.add(entries.get(i));
- continue;
- }
- }
- else {
- if(labels == null) {
- labels = new LabelList(1);
- }
- labels.add(entries.get(i));
- }
- }
- curvec = new SparseFloatVector(values, maxdim);
- curlbl = labels;
- }
-
- @Override
- protected SimpleTypeInformation<SparseFloatVector> getTypeInformation(int dimensionality) {
- if(dimensionality > 0) {
- return new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, dimensionality, new SparseFloatVector(SparseFloatVector.EMPTYMAP, dimensionality));
- }
- if(dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<SparseFloatVector>(SparseFloatVector.class);
- }
- throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
- }
-
- @Override
- protected Logging getLogger() {
- return logger;
- }
-
/**
* Parameterization class.
*
@@ -150,7 +88,7 @@ public class SparseFloatVectorLabelParser extends NumberVectorLabelParser<Sparse
*
* @apiviz.exclude
*/
- public static class Parameterizer extends NumberVectorLabelParser.Parameterizer<SparseFloatVector> {
+ public static class Parameterizer extends SparseNumberVectorLabelParser.Parameterizer<SparseFloatVector> {
@Override
protected SparseFloatVectorLabelParser makeInstance() {
return new SparseFloatVectorLabelParser(colSep, quoteChar, labelIndices);
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
new file mode 100644
index 00000000..917dd2aa
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
@@ -0,0 +1,185 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.map.hash.TIntDoubleHashMap;
+
+import java.util.BitSet;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.LabelList;
+import de.lmu.ifi.dbs.elki.data.SparseDoubleVector;
+import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
+import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * <p>
+ * Provides a parser for parsing one point per line, attributes separated by
+ * whitespace.
+ * </p>
+ * <p>
+ * Several labels may be given per point. A label must not be parseable as
+ * double. Lines starting with &quot;#&quot; will be ignored.
+ * </p>
+ * <p>
+ * A line is expected in the following format: The first entry of each line is
+ * the number of attributes with coordinate value not zero. Subsequent entries
+ * are of the form <code>index value </code> each, where index is the number of
+ * the corresponding dimension, and value is the value of the corresponding
+ * attribute. A complete line could then look like this:
+ *
+ * <pre>
+ * 3 7 12.34 8 56.78 11 1.234 objectlabel
+ * </pre>
+ *
+ * where <code>3</code> indicates there are three attributes set,
+ * <code>7,8,11</code> are the attribute indexes and there is a non-numerical
+ * object label.
+ * </p>
+ * <p>
+ * An index can be specified to identify an entry to be treated as class label.
+ * This index counts all entries (numeric and labels as well) starting with 0.
+ * </p>
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.has SparseNumberVector
+ */
+// FIXME: Maxdim!
+@Title("Sparse Vector Label Parser")
+@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored.")
+public class SparseNumberVectorLabelParser<V extends SparseNumberVector<V, ?>> extends NumberVectorLabelParser<V> {
+ /**
+ * Class logger
+ */
+ private static final Logging logger = Logging.getLogger(SparseNumberVectorLabelParser.class);
+
+ /**
+ * Holds the dimensionality of the parsed data which is the maximum occurring
+ * index of any attribute.
+ */
+ private int maxdim = -1;
+
+ /**
+ * Constructor.
+ *
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param labelIndices Label indexes
+ * @param factory Vector factory
+ */
+ public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, V factory) {
+ super(colSep, quoteChar, labelIndices, factory);
+ }
+
+ @Override
+ protected void parseLineInternal(String line) {
+ List<String> entries = tokenize(line);
+ int cardinality = Integer.parseInt(entries.get(0));
+
+ TIntDoubleHashMap values = new TIntDoubleHashMap(cardinality, 1);
+ LabelList labels = null;
+
+ for(int i = 1; i < entries.size() - 1; i++) {
+ if(!labelIndices.get(i)) {
+ try {
+ int index = Integer.valueOf(entries.get(i));
+ if(index >= maxdim) {
+ maxdim = index + 1;
+ }
+ double attribute = Double.valueOf(entries.get(i));
+ values.put(index, attribute);
+ i++;
+ }
+ catch(NumberFormatException e) {
+ if(labels == null) {
+ labels = new LabelList(1);
+ }
+ labels.add(entries.get(i));
+ continue;
+ }
+ }
+ else {
+ if(labels == null) {
+ labels = new LabelList(1);
+ }
+ labels.add(entries.get(i));
+ }
+ }
+ if(values.size() > maxdim) {
+ throw new AbortException("Invalid sparse vector seen: " + line);
+ }
+ curvec = factory.newNumberVector(values, maxdim);
+ curlbl = labels;
+ }
+
+ @Override
+ protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
+ @SuppressWarnings("unchecked")
+ Class<V> cls = (Class<V>) factory.getClass();
+ if(dimensionality > 0) {
+ return new VectorFieldTypeInformation<V>(cls, dimensionality, factory.newNumberVector(SparseDoubleVector.EMPTYMAP, dimensionality));
+ }
+ if(dimensionality == DIMENSIONALITY_VARIABLE) {
+ return new SimpleTypeInformation<V>(cls);
+ }
+ throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends SparseNumberVector<V, ?>> extends NumberVectorLabelParser.Parameterizer<V> {
+ @Override
+ protected void getFactory(Parameterization config) {
+ ObjectParameter<V> factoryP = new ObjectParameter<V>(VECTOR_TYPE_ID, SparseNumberVector.class, SparseFloatVector.class);
+ if(config.grab(factoryP)) {
+ factory = factoryP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected SparseNumberVectorLabelParser<V> makeInstance() {
+ return new SparseNumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory);
+ }
+ }
+} \ No newline at end of file