summary refs log tree commit diff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java 37
1 files changed, 20 insertions, 17 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
index 902d59a9..f9b34c92 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -44,8 +44,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
+ * Parser for parsing one point per line, attributes separated by whitespace.
* </p>
* <p>
* Several labels may be given per point. A label must not be parseable as
@@ -56,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* the number of attributes with coordinate value not zero. Subsequent entries
* are of the form <code>index value </code> each, where index is the number of
* the corresponding dimension, and value is the value of the corresponding
- * attribute. A complet line then could look like this:
+ * attribute. A complete line then could look like this:
*
* <pre>
* 3 7 12.34 8 56.78 11 1.234 objectlabel
@@ -77,10 +76,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> vector type
*/
-// FIXME: Maxdim!
@Title("Sparse Vector Label Parser")
@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored.")
-public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> extends NumberVectorLabelParser<V> {
+public class SparseNumberVectorLabelParser<V extends SparseNumberVector> extends NumberVectorLabelParser<V> {
/**
* Class logger.
*/
@@ -89,7 +87,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
/**
* Same as {@link #factory}, but subtype.
*/
- private SparseNumberVector.Factory<V, ?> sparsefactory;
+ protected SparseNumberVector.Factory<V> sparsefactory;
/**
* (Reused) set of values for the number vector.
@@ -110,18 +108,17 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
* @param labelIndices Indices to use as labels
* @param factory Vector factory
*/
- public SparseNumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ public SparseNumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V> factory) {
super(colSep, quoteChars, comment, labelIndices, factory);
this.sparsefactory = factory;
}
@Override
- protected void parseLineInternal(String line) {
- tokenizer.initialize(line, 0, lengthWithoutLinefeed(line));
+ protected boolean parseLineInternal() {
+ /* tokenizer initialized by nextLineExceptComments() */
int cardinality = (int) tokenizer.getLongBase10();
+ tokenizer.advance();
- values.clear();
- labels.clear();
int thismax = 0;
while(tokenizer.valid()) {
@@ -130,7 +127,10 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
int index = (int) tokenizer.getLongBase10();
tokenizer.advance();
// Respect labelIndices.
- if(labelIndices == null || !labelIndices.get(index)) {
+ if(!isLabelColumn(index)) {
+ if(!tokenizer.valid()) {
+ throw new AbortException("Parser expected double value, but line ended too early: " + getLineNumber());
+ }
double attribute = tokenizer.getDouble();
thismax = Math.max(thismax, index + 1);
values.put(index, attribute);
@@ -149,6 +149,9 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
}
curvec = sparsefactory.newNumberVector(values, thismax);
curlbl = LabelList.make(labels);
+ values.clear();
+ labels.clear();
+ return true;
}
@Override
@@ -157,7 +160,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
return new VectorFieldTypeInformation<>(factory, mindim);
}
else if(mindim < maxdim) {
- return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
+ return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim);
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@@ -174,10 +177,10 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
+ public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
if(config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -185,7 +188,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
@Override
protected SparseNumberVectorLabelParser<V> makeInstance() {
- return new SparseNumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
+ return new SparseNumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, (SparseNumberVector.Factory<V>) factory);
}
}
}