diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java | 248 |
1 files changed, 203 insertions, 45 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java index a89f6c3a..01606e77 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,26 +23,33 @@ package de.lmu.ifi.dbs.elki.datasource.parser; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.list.array.TDoubleArrayList; + import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.ArrayList; import java.util.BitSet; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; +import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; -import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; -import de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle; +import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.persistent.ByteBufferSerializer; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * <p> @@ -59,9 +66,18 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * </p> * * @author Arthur Zimek + * + * @apiviz.landmark + * @apiviz.has NumberVector + * * @param <V> the type of NumberVector used */ -public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> extends AbstractParser implements LinebasedParser, Parser { +public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends AbstractStreamingParser { + /** + * Logging class. + */ + private static final Logging logger = Logging.getLogger(NumberVectorLabelParser.class); + /** * A comma separated list of the indices of labels (may be numeric), counting * whitespace separated entries in a line starting with 0. The corresponding @@ -73,95 +89,186 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte public static final OptionID LABEL_INDICES_ID = OptionID.getOrCreateOptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label."); /** + * Parameter to specify the type of vectors to produce. + * <p> + * Key: {@code -parser.vector-type}<br /> + * Default: DoubleVector + * </p> + */ + public static final OptionID VECTOR_TYPE_ID = OptionID.getOrCreateOptionID("parser.vector-type", "The type of vectors to create for numerical attributes."); + + /** + * Constant used for unknown dimensionality (e.g. empty files) + */ + public static final int DIMENSIONALITY_UNKNOWN = -1; + + /** + * Constant used for records of variable dimensionality (e.g. time series) + */ + public static final int DIMENSIONALITY_VARIABLE = -2; + + /** * Keeps the indices of the attributes to be treated as a string label. */ protected BitSet labelIndices; /** + * Vector factory class + */ + protected V factory; + + /** + * Buffer reader + */ + private BufferedReader reader; + + /** + * Current line number + */ + protected int lineNumber; + + /** + * Dimensionality reported + */ + protected int dimensionality; + + /** + * Metadata + */ + protected BundleMeta meta = null; + + /** + * Current vector + */ + protected V curvec = null; + + /** + * Current labels + */ + protected LabelList curlbl = null; + + /** + * Event to report next + */ + Event nextevent = null; + + /** * Constructor * * @param colSep * @param quoteChar * @param labelIndices + * @param factory Vector factory */ - public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) { + public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, V factory) { super(colSep, quoteChar); this.labelIndices = labelIndices; + this.factory = factory; + } + + @Override + public void initStream(InputStream in) { + reader = new BufferedReader(new InputStreamReader(in)); + lineNumber = 1; + dimensionality = DIMENSIONALITY_UNKNOWN; + } + + @Override + public BundleMeta getMeta() { + return meta; } @Override - public MultipleObjectsBundle parse(InputStream in) { - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - int lineNumber = 1; - int dimensionality = -1; - List<V> vectors = new ArrayList<V>(); - List<LabelList> labels = new ArrayList<LabelList>(); + public Event nextEvent() { + if(nextevent != null) { + Event ret = nextevent; + nextevent = null; + return ret; + } try { for(String line; (line = reader.readLine()) != null; lineNumber++) { if(!line.startsWith(COMMENT) && line.length() > 0) { - Pair<V, LabelList> objectAndLabels = parseLineInternal(line); - if(dimensionality < 0) { - dimensionality = objectAndLabels.getFirst().getDimensionality(); + parseLineInternal(line); + if(dimensionality == DIMENSIONALITY_UNKNOWN) { + dimensionality = curvec.getDimensionality(); + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; } - else if(dimensionality != objectAndLabels.getFirst().getDimensionality()) { - throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ":" + objectAndLabels.getFirst().getDimensionality() + " != " + dimensionality); + else if(dimensionality > 0) { + if(dimensionality != curvec.getDimensionality()) { + dimensionality = DIMENSIONALITY_VARIABLE; + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; + } } - vectors.add(objectAndLabels.first); - labels.add(objectAndLabels.second); + return Event.NEXT_OBJECT; } } + reader.close(); + reader = null; + return Event.END_OF_STREAM; } catch(IOException e) { throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); } - return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels); + } + + /** + * Update the meta element. + */ + protected void buildMeta() { + meta = new BundleMeta(2); + meta.add(getTypeInformation(dimensionality)); + meta.add(TypeUtil.LABELLIST); } @Override - public SingleObjectBundle parseLine(String line) { - Pair<V, LabelList> objectAndLabels = parseLineInternal(line); - SingleObjectBundle pkg = new SingleObjectBundle(); - pkg.append(getTypeInformation(objectAndLabels.first.getDimensionality()), objectAndLabels.first); - pkg.append(TypeUtil.LABELLIST, objectAndLabels.second); - return pkg; + public Object data(int rnum) { + if(rnum == 0) { + return curvec; + } + if(rnum == 1) { + return curlbl; + } + throw new ArrayIndexOutOfBoundsException(); } /** - * Internal method for parsing a single line. Used by both line based parsig + * Internal method for parsing a single line. Used by both line based parsing * as well as block parsing. This saves the building of meta data for each * line. * * @param line Line to process - * @return parsing result */ - protected Pair<V, LabelList> parseLineInternal(String line) { + protected void parseLineInternal(String line) { List<String> entries = tokenize(line); - // Split into numerical attributes and labels - List<Double> attributes = new ArrayList<Double>(entries.size()); - LabelList labels = new LabelList(); + TDoubleArrayList attributes = new TDoubleArrayList(entries.size()); + LabelList labels = null; Iterator<String> itr = entries.iterator(); for(int i = 0; itr.hasNext(); i++) { String ent = itr.next(); if(!labelIndices.get(i)) { try { - Double attribute = Double.valueOf(ent); + double attribute = Double.parseDouble(ent); attributes.add(attribute); + continue; } catch(NumberFormatException e) { - labels.add(ent); + // Ignore attempt, add to labels below. } } - else { - labels.add(ent); + if(labels == null) { + labels = new LabelList(1); } + labels.add(ent); } - Pair<V, LabelList> objectAndLabels; - V vec = createDBObject(attributes); - objectAndLabels = new Pair<V, LabelList>(vec, labels); - return objectAndLabels; + curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER); + curlbl = labels; } /** @@ -172,7 +279,9 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte * @param attributes the attributes of the vector to create. * @return a RalVector of type V containing the given attribute values */ - protected abstract V createDBObject(List<Double> attributes); + protected <A> V createDBObject(A attributes, NumberArrayAdapter<?, A> adapter) { + return factory.newNumberVector(attributes, adapter); + } /** * Get a prototype object for the given dimensionality. @@ -180,7 +289,37 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte * @param dimensionality Dimensionality * @return Prototype object */ - abstract protected VectorFieldTypeInformation<V> getTypeInformation(int dimensionality); + SimpleTypeInformation<V> getTypeInformation(int dimensionality) { + @SuppressWarnings("unchecked") + Class<V> cls = (Class<V>) factory.getClass(); + if(dimensionality > 0) { + V f = factory.newNumberVector(new double[dimensionality]); + if(f instanceof ByteBufferSerializer) { + // TODO: Remove, once we have serializers for all types + @SuppressWarnings("unchecked") + final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f; + return new VectorFieldTypeInformation<V>(cls, ser, dimensionality, f); + } + return new VectorFieldTypeInformation<V>(cls, dimensionality, f); + } + // Variable dimensionality - return non-vector field type + if(dimensionality == DIMENSIONALITY_VARIABLE) { + V f = factory.newNumberVector(new double[0]); + if(f instanceof ByteBufferSerializer) { + // TODO: Remove, once we have serializers for all types + @SuppressWarnings("unchecked") + final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f; + return new SimpleTypeInformation<V>(cls, ser); + } + return new SimpleTypeInformation<V>(cls); + } + throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); + } + + @Override + protected Logging getLogger() { + return logger; + } /** * Parameterization class. @@ -189,15 +328,32 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte * * @apiviz.exclude */ - public static abstract class Parameterizer<V extends NumberVector<?, ?>> extends AbstractParser.Parameterizer { + public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParser.Parameterizer { /** * Keeps the indices of the attributes to be treated as a string label. */ protected BitSet labelIndices = null; + /** + * Factory + */ + protected V factory; + @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); + getLabelIndices(config); + getFactory(config); + } + + protected void getFactory(Parameterization config) { + ObjectParameter<V> factoryP = new ObjectParameter<V>(VECTOR_TYPE_ID, NumberVector.class, DoubleVector.class); + if(config.grab(factoryP)) { + factory = factoryP.instantiateClass(config); + } + } + + protected void getLabelIndices(Parameterization config) { IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true); labelIndices = new BitSet(); @@ -210,6 +366,8 @@ public abstract class NumberVectorLabelParser<V extends NumberVector<?, ?>> exte } @Override - protected abstract NumberVectorLabelParser<V> makeInstance(); + protected NumberVectorLabelParser<V> makeInstance() { + return new NumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory); + } } }
\ No newline at end of file |