summary refs log tree commit diff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java 194
1 files changed, 83 insertions, 111 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index 3fe4af09..e09dcd22 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,13 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
import gnu.trove.list.array.TDoubleArrayList;
-import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.BitSet;
-import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
@@ -46,6 +43,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.hash.Unique;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -53,18 +51,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
- * <p>
- * Provides a parser for parsing one point per line, attributes separated by
- * whitespace.
- * </p>
- * <p>
+ * Parser for a simple CSV type of format, with columns separated by the given
+ * pattern (default: whitespace).
+ *
* Several labels may be given per point. A label must not be parseable as
* double. Lines starting with &quot;#&quot; will be ignored.
- * </p>
- * <p>
+ *
* An index can be specified to identify an entry to be treated as class label.
* This index counts all entries (numeric and labels as well) starting with 0.
- * </p>
*
* @author Arthur Zimek
*
@@ -73,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> the type of NumberVector used
*/
-public class NumberVectorLabelParser<V extends NumberVector<?>> extends AbstractStreamingParser {
+public class NumberVectorLabelParser<V extends NumberVector> extends AbstractStreamingParser {
/**
* Logging class.
*/
@@ -82,22 +76,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* Keeps the indices of the attributes to be treated as a string label.
*/
- protected BitSet labelIndices;
+ private BitSet labelIndices;
/**
* Vector factory class.
*/
- protected NumberVector.Factory<V, ?> factory;
-
- /**
- * Buffer reader.
- */
- private BufferedReader reader;
-
- /**
- * Current line number.
- */
- protected int lineNumber;
+ protected NumberVector.Factory<V> factory;
/**
* Dimensionality reported.
@@ -115,11 +99,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
protected List<String> columnnames = null;
/**
- * Bitset to indicate which columns are not numeric.
- */
- protected BitSet labelcolumns = null;
-
- /**
* Whether or not the data set has labels.
*/
protected boolean haslabels = false;
@@ -147,7 +126,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* For String unification.
*/
- HashMap<String, String> unique = new HashMap<>();
+ Unique<String> unique = new Unique<>();
/**
* Event to report next.
@@ -159,7 +138,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*
* @param factory Vector factory
*/
- public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) {
+ public NumberVectorLabelParser(NumberVector.Factory<V> factory) {
this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory);
}
@@ -172,24 +151,30 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param labelIndices Column indexes to be treated as labels, not as numeric attributes.
* @param factory Vector factory
*/
- public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
+ public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V> factory) {
super(colSep, quoteChars, comment);
this.labelIndices = labelIndices;
this.factory = factory;
}
+ /**
+ * Test if the current column is marked as label column.
+ *
+ * @param col Column number
+ * @return {@code true} when a label column.
+ */
+ protected boolean isLabelColumn(int col) {
+ return labelIndices != null && labelIndices.get(col);
+ }
+
@Override
public void initStream(InputStream in) {
- reader = new BufferedReader(new InputStreamReader(in));
- lineNumber = 1;
+ super.initStream(in);
mindim = Integer.MAX_VALUE;
maxdim = 0;
columnnames = null;
haslabels = false;
- labelcolumns = new BitSet();
- if(labelIndices != null) {
- labelcolumns.or(labelIndices);
- }
+ nextevent = null;
}
@Override
@@ -205,41 +190,37 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
return ret;
}
try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- // Skip empty lines and comments
- if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
- continue;
- }
- parseLineInternal(line);
- // Maybe a header column?
- if(curvec == null) {
- continue;
- }
- final int curdim = curvec.getDimensionality();
- if(curdim > maxdim || mindim > curdim) {
- mindim = Math.min(mindim, curdim);
- maxdim = Math.max(maxdim, curdim);
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
- }
- else if(curlbl != null && meta != null && meta.size() == 1) {
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
+ while(nextLineExceptComments()) {
+ if(parseLineInternal()) {
+ final int curdim = curvec.getDimensionality();
+ if(curdim > maxdim || mindim > curdim) {
+ mindim = (curdim < mindim) ? curdim : mindim;
+ maxdim = (curdim > maxdim) ? curdim : maxdim;
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ }
+ else if(curlbl != null && meta != null && meta.size() == 1) {
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ }
+ return Event.NEXT_OBJECT;
}
- return Event.NEXT_OBJECT;
}
- reader.close();
- reader = null;
- unique.clear();
return Event.END_OF_STREAM;
}
catch(IOException e) {
- throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
+ throw new IllegalArgumentException("Error while parsing line " + getLineNumber() + ".");
}
}
+ @Override
+ public void cleanup() {
+ super.cleanup();
+ unique.clear();
+ }
+
/**
* Update the meta element.
*/
@@ -257,13 +238,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
@Override
public Object data(int rnum) {
- if(rnum == 0) {
- return curvec;
- }
- if(rnum == 1) {
- return curlbl;
+ if(rnum > 1) {
+ throw new ArrayIndexOutOfBoundsException();
}
- throw new ArrayIndexOutOfBoundsException();
+ return (rnum == 0) ? curvec : curlbl;
}
/**
@@ -271,16 +249,14 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* as well as block parsing. This saves the building of meta data for each
* line.
*
- * @param line Line to process
+ * @return {@code true} when a valid line was read, {@code false} on a label
+ * row.
*/
- protected void parseLineInternal(String line) {
- attributes.reset();
- labels.clear();
-
+ protected boolean parseLineInternal() {
// Split into numerical attributes and labels
int i = 0;
- for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) {
- if(labelIndices == null || !labelIndices.get(i)) {
+ for(/* initialized by nextLineExceptComments()! */; tokenizer.valid(); tokenizer.advance(), i++) {
+ if(!isLabelColumn(i) && !tokenizer.isQuoted()) {
try {
double attribute = tokenizer.getDouble();
attributes.add(attribute);
@@ -288,34 +264,30 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
}
catch(NumberFormatException e) {
// Ignore attempt, add to labels below.
- labelcolumns.set(i);
}
}
// Else: labels.
- haslabels = true;
- final String lbl = tokenizer.getSubstring();
- String u = unique.get(lbl);
- if(u == null) {
- u = lbl;
- unique.put(u, u);
+ String lbl = tokenizer.getStrippedSubstring();
+ if(lbl.length() > 0) {
+ haslabels = true;
+ lbl = unique.addOrGet(lbl);
+ labels.add(lbl);
}
- labels.add(u);
}
// Maybe a label row?
- if(lineNumber == 1 && attributes.size() == 0) {
+ if(getLineNumber() == 1 && attributes.size() == 0) {
columnnames = new ArrayList<>(labels);
- labelcolumns.clear();
- if(labelIndices != null) {
- labelcolumns.or(labelIndices);
- }
+ haslabels = false;
curvec = null;
curlbl = null;
- haslabels = false;
- return;
+ return false;
}
// Pass outside via class variables
curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
curlbl = LabelList.make(labels);
+ attributes.reset();
+ labels.clear();
+ return true;
}
/**
@@ -338,28 +310,28 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @return Prototype object
*/
SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
+ if(mindim > maxdim) {
+ throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
+ }
if(mindim == maxdim) {
String[] colnames = null;
if(columnnames != null) {
- if(columnnames.size() - labelcolumns.cardinality() == mindim) {
- colnames = new String[mindim];
- for(int i = 0, j = 0; i < columnnames.size(); i++) {
- if(!labelcolumns.get(i)) {
- colnames[j] = columnnames.get(i);
- j++;
- }
+ colnames = new String[mindim];
+ int j = 0;
+ for(int i = 0; i < mindim; i++) {
+ if(!isLabelColumn(i)) {
+ colnames[j] = columnnames.get(i);
+ j++;
}
}
+ if(j == mindim) {
+ colnames = null; // Did not work
+ }
}
return new VectorFieldTypeInformation<>(factory, mindim, colnames);
}
- else if(mindim < maxdim) {
- // Variable dimensionality - return non-vector field type
- return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
- }
- else {
- throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
- }
+ // Variable dimensionality - return non-vector field type
+ return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim);
}
@Override
@@ -374,7 +346,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParser.Parameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParser.Parameterizer {
/**
* A comma separated list of the indices of labels (may be numeric),
* counting whitespace separated entries in a line starting with 0. The
@@ -402,7 +374,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* Factory object.
*/
- protected NumberVector.Factory<V, ?> factory;
+ protected NumberVector.Factory<V> factory;
@Override
protected void makeOptions(Parameterization config) {
@@ -417,7 +389,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param config Parameterization
*/
protected void getFactory(Parameterization config) {
- ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
+ ObjectParameter<NumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
if(config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}