summary refs log tree commit diff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java 248
1 files changed, 203 insertions, 45 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index a89f6c3a..01606e77 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,26 +23,33 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import gnu.trove.list.array.TDoubleArrayList;
+
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
+import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle;
+import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.persistent.ByteBufferSerializer;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* <p>
@@ -59,9 +66,18 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
* </p>
*
* @author Arthur Zimek
+ *
+ * @apiviz.landmark
+ * @apiviz.has NumberVector
+ *
* @param <V> the type of NumberVector used
*/
-public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> extends AbstractParser implements LinebasedParser, Parser {
+public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends AbstractStreamingParser {
+ /**
+ * Logging class.
+ */
+ private static final Logging logger = Logging.getLogger(NumberVectorLabelParser.class);
+
/**
* A comma separated list of the indices of labels (may be numeric), counting
* whitespace separated entries in a line starting with 0. The corresponding
@@ -73,95 +89,186 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte
public static final OptionID LABEL_INDICES_ID = OptionID.getOrCreateOptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");
/**
+ * Parameter to specify the type of vectors to produce.
+ * <p>
+ * Key: {@code -parser.vector-type}<br />
+ * Default: DoubleVector
+ * </p>
+ */
+ public static final OptionID VECTOR_TYPE_ID = OptionID.getOrCreateOptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");
+
+ /**
+ * Constant used for unknown dimensionality (e.g. empty files)
+ */
+ public static final int DIMENSIONALITY_UNKNOWN = -1;
+
+ /**
+ * Constant used for records of variable dimensionality (e.g. time series)
+ */
+ public static final int DIMENSIONALITY_VARIABLE = -2;
+
+ /**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices;
/**
+ * Vector factory class
+ */
+ protected V factory;
+
+ /**
+ * Buffer reader
+ */
+ private BufferedReader reader;
+
+ /**
+ * Current line number
+ */
+ protected int lineNumber;
+
+ /**
+ * Dimensionality reported
+ */
+ protected int dimensionality;
+
+ /**
+ * Metadata
+ */
+ protected BundleMeta meta = null;
+
+ /**
+ * Current vector
+ */
+ protected V curvec = null;
+
+ /**
+ * Current labels
+ */
+ protected LabelList curlbl = null;
+
+ /**
+ * Event to report next
+ */
+ Event nextevent = null;
+
+ /**
* Constructor
*
* @param colSep
* @param quoteChar
* @param labelIndices
+ * @param factory Vector factory
*/
- public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
+ public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, V factory) {
super(colSep, quoteChar);
this.labelIndices = labelIndices;
+ this.factory = factory;
+ }
+
+ @Override
+ public void initStream(InputStream in) {
+ reader = new BufferedReader(new InputStreamReader(in));
+ lineNumber = 1;
+ dimensionality = DIMENSIONALITY_UNKNOWN;
+ }
+
+ @Override
+ public BundleMeta getMeta() {
+ return meta;
}
@Override
- public MultipleObjectsBundle parse(InputStream in) {
- BufferedReader reader = new BufferedReader(new InputStreamReader(in));
- int lineNumber = 1;
- int dimensionality = -1;
- List<V> vectors = new ArrayList<V>();
- List<LabelList> labels = new ArrayList<LabelList>();
+ public Event nextEvent() {
+ if(nextevent != null) {
+ Event ret = nextevent;
+ nextevent = null;
+ return ret;
+ }
try {
for(String line; (line = reader.readLine()) != null; lineNumber++) {
if(!line.startsWith(COMMENT) && line.length() > 0) {
- Pair<V, LabelList> objectAndLabels = parseLineInternal(line);
- if(dimensionality < 0) {
- dimensionality = objectAndLabels.getFirst().getDimensionality();
+ parseLineInternal(line);
+ if(dimensionality == DIMENSIONALITY_UNKNOWN) {
+ dimensionality = curvec.getDimensionality();
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
}
- else if(dimensionality != objectAndLabels.getFirst().getDimensionality()) {
- throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ":" + objectAndLabels.getFirst().getDimensionality() + " != " + dimensionality);
+ else if(dimensionality > 0) {
+ if(dimensionality != curvec.getDimensionality()) {
+ dimensionality = DIMENSIONALITY_VARIABLE;
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ }
}
- vectors.add(objectAndLabels.first);
- labels.add(objectAndLabels.second);
+ return Event.NEXT_OBJECT;
}
}
+ reader.close();
+ reader = null;
+ return Event.END_OF_STREAM;
}
catch(IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
- return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels);
+ }
+
+ /**
+ * Update the meta element.
+ */
+ protected void buildMeta() {
+ meta = new BundleMeta(2);
+ meta.add(getTypeInformation(dimensionality));
+ meta.add(TypeUtil.LABELLIST);
}
@Override
- public SingleObjectBundle parseLine(String line) {
- Pair<V, LabelList> objectAndLabels = parseLineInternal(line);
- SingleObjectBundle pkg = new SingleObjectBundle();
- pkg.append(getTypeInformation(objectAndLabels.first.getDimensionality()), objectAndLabels.first);
- pkg.append(TypeUtil.LABELLIST, objectAndLabels.second);
- return pkg;
+ public Object data(int rnum) {
+ if(rnum == 0) {
+ return curvec;
+ }
+ if(rnum == 1) {
+ return curlbl;
+ }
+ throw new ArrayIndexOutOfBoundsException();
}
/**
- * Internal method for parsing a single line. Used by both line based parsig
+ * Internal method for parsing a single line. Used by both line based parsing
* as well as block parsing. This saves the building of meta data for each
* line.
*
* @param line Line to process
- * @return parsing result
*/
- protected Pair<V, LabelList> parseLineInternal(String line) {
+ protected void parseLineInternal(String line) {
List<String> entries = tokenize(line);
-
// Split into numerical attributes and labels
- List<Double> attributes = new ArrayList<Double>(entries.size());
- LabelList labels = new LabelList();
+ TDoubleArrayList attributes = new TDoubleArrayList(entries.size());
+ LabelList labels = null;
Iterator<String> itr = entries.iterator();
for(int i = 0; itr.hasNext(); i++) {
String ent = itr.next();
if(!labelIndices.get(i)) {
try {
- Double attribute = Double.valueOf(ent);
+ double attribute = Double.parseDouble(ent);
attributes.add(attribute);
+ continue;
}
catch(NumberFormatException e) {
- labels.add(ent);
+ // Ignore attempt, add to labels below.
}
}
- else {
- labels.add(ent);
+ if(labels == null) {
+ labels = new LabelList(1);
}
+ labels.add(ent);
}
- Pair<V, LabelList> objectAndLabels;
- V vec = createDBObject(attributes);
- objectAndLabels = new Pair<V, LabelList>(vec, labels);
- return objectAndLabels;
+ curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
+ curlbl = labels;
}
/**
@@ -172,7 +279,9 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte
* @param attributes the attributes of the vector to create.
* @return a RalVector of type V containing the given attribute values
*/
- protected abstract V createDBObject(List<Double> attributes);
+ protected <A> V createDBObject(A attributes, NumberArrayAdapter<?, A> adapter) {
+ return factory.newNumberVector(attributes, adapter);
+ }
/**
* Get a prototype object for the given dimensionality.
@@ -180,7 +289,37 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte
* @param dimensionality Dimensionality
* @return Prototype object
*/
- abstract protected VectorFieldTypeInformation<V> getTypeInformation(int dimensionality);
+ SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
+ @SuppressWarnings("unchecked")
+ Class<V> cls = (Class<V>) factory.getClass();
+ if(dimensionality > 0) {
+ V f = factory.newNumberVector(new double[dimensionality]);
+ if(f instanceof ByteBufferSerializer) {
+ // TODO: Remove, once we have serializers for all types
+ @SuppressWarnings("unchecked")
+ final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f;
+ return new VectorFieldTypeInformation<V>(cls, ser, dimensionality, f);
+ }
+ return new VectorFieldTypeInformation<V>(cls, dimensionality, f);
+ }
+ // Variable dimensionality - return non-vector field type
+ if(dimensionality == DIMENSIONALITY_VARIABLE) {
+ V f = factory.newNumberVector(new double[0]);
+ if(f instanceof ByteBufferSerializer) {
+ // TODO: Remove, once we have serializers for all types
+ @SuppressWarnings("unchecked")
+ final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f;
+ return new SimpleTypeInformation<V>(cls, ser);
+ }
+ return new SimpleTypeInformation<V>(cls);
+ }
+ throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
/**
* Parameterization class.
@@ -189,15 +328,32 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte
*
* @apiviz.exclude
*/
- public static abstract class Parameterizer<V extends NumberVector<?, ?>> extends AbstractParser.Parameterizer {
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParser.Parameterizer {
/**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices = null;
+ /**
+ * Factory
+ */
+ protected V factory;
+
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
+ getLabelIndices(config);
+ getFactory(config);
+ }
+
+ protected void getFactory(Parameterization config) {
+ ObjectParameter<V> factoryP = new ObjectParameter<V>(VECTOR_TYPE_ID, NumberVector.class, DoubleVector.class);
+ if(config.grab(factoryP)) {
+ factory = factoryP.instantiateClass(config);
+ }
+ }
+
+ protected void getLabelIndices(Parameterization config) {
IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true);
labelIndices = new BitSet();
@@ -210,6 +366,8 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte
}
@Override
- protected abstract NumberVectorLabelParser<V> makeInstance();
+ protected NumberVectorLabelParser<V> makeInstance() {
+ return new NumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory);
+ }
}
}
\ No newline at end of file