diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java | 194 |
1 files changed, 83 insertions, 111 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java index 3fe4af09..e09dcd22 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,13 +25,10 @@ package de.lmu.ifi.dbs.elki.datasource.parser; import gnu.trove.list.array.TDoubleArrayList; -import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.util.ArrayList; import java.util.BitSet; -import java.util.HashMap; import java.util.List; import java.util.regex.Pattern; @@ -46,6 +43,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.hash.Unique; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -53,18 +51,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * <p> - * Provides a parser for parsing one point per line, attributes separated by - * whitespace. - * </p> - * <p> + * Parser for a simple CSV type of format, with columns separated by the given + * pattern (default: whitespace). + * * Several labels may be given per point. A label must not be parseable as * double. Lines starting with "#" will be ignored. - * </p> - * <p> + * * An index can be specified to identify an entry to be treated as class label. * This index counts all entries (numeric and labels as well) starting with 0. - * </p> * * @author Arthur Zimek * @@ -73,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @param <V> the type of NumberVector used */ -public class NumberVectorLabelParser<V extends NumberVector<?>> extends AbstractStreamingParser { +public class NumberVectorLabelParser<V extends NumberVector> extends AbstractStreamingParser { /** * Logging class. */ @@ -82,22 +76,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract /** * Keeps the indices of the attributes to be treated as a string label. */ - protected BitSet labelIndices; + private BitSet labelIndices; /** * Vector factory class. */ - protected NumberVector.Factory<V, ?> factory; - - /** - * Buffer reader. - */ - private BufferedReader reader; - - /** - * Current line number. - */ - protected int lineNumber; + protected NumberVector.Factory<V> factory; /** * Dimensionality reported. @@ -115,11 +99,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract protected List<String> columnnames = null; /** - * Bitset to indicate which columns are not numeric. - */ - protected BitSet labelcolumns = null; - - /** * Whether or not the data set has labels. */ protected boolean haslabels = false; @@ -147,7 +126,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract /** * For String unification. */ - HashMap<String, String> unique = new HashMap<>(); + Unique<String> unique = new Unique<>(); /** * Event to report next. @@ -159,7 +138,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * * @param factory Vector factory */ - public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) { + public NumberVectorLabelParser(NumberVector.Factory<V> factory) { this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory); } @@ -172,24 +151,30 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @param labelIndices Column indexes that are numeric. * @param factory Vector factory */ - public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) { + public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V> factory) { super(colSep, quoteChars, comment); this.labelIndices = labelIndices; this.factory = factory; } + /** + * Test if the current column is marked as label column. + * + * @param col Column number + * @return {@code true} when a label column. + */ + protected boolean isLabelColumn(int col) { + return labelIndices != null && labelIndices.get(col); + } + @Override public void initStream(InputStream in) { - reader = new BufferedReader(new InputStreamReader(in)); - lineNumber = 1; + super.initStream(in); mindim = Integer.MAX_VALUE; maxdim = 0; columnnames = null; haslabels = false; - labelcolumns = new BitSet(); - if(labelIndices != null) { - labelcolumns.or(labelIndices); - } + nextevent = null; } @Override @@ -205,41 +190,37 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract return ret; } try { - for(String line; (line = reader.readLine()) != null; lineNumber++) { - // Skip empty lines and comments - if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { - continue; - } - parseLineInternal(line); - // Maybe a header column? - if(curvec == null) { - continue; - } - final int curdim = curvec.getDimensionality(); - if(curdim > maxdim || mindim > curdim) { - mindim = Math.min(mindim, curdim); - maxdim = Math.max(maxdim, curdim); - buildMeta(); - nextevent = Event.NEXT_OBJECT; - return Event.META_CHANGED; - } - else if(curlbl != null && meta != null && meta.size() == 1) { - buildMeta(); - nextevent = Event.NEXT_OBJECT; - return Event.META_CHANGED; + while(nextLineExceptComments()) { + if(parseLineInternal()) { + final int curdim = curvec.getDimensionality(); + if(curdim > maxdim || mindim > curdim) { + mindim = (curdim < mindim) ? curdim : mindim; + maxdim = (curdim > maxdim) ? curdim : maxdim; + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; + } + else if(curlbl != null && meta != null && meta.size() == 1) { + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; + } + return Event.NEXT_OBJECT; } - return Event.NEXT_OBJECT; } - reader.close(); - reader = null; - unique.clear(); return Event.END_OF_STREAM; } catch(IOException e) { - throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); + throw new IllegalArgumentException("Error while parsing line " + getLineNumber() + "."); } } + @Override + public void cleanup() { + super.cleanup(); + unique.clear(); + } + /** * Update the meta element. */ @@ -257,13 +238,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract @Override public Object data(int rnum) { - if(rnum == 0) { - return curvec; - } - if(rnum == 1) { - return curlbl; + if(rnum > 1) { + throw new ArrayIndexOutOfBoundsException(); } - throw new ArrayIndexOutOfBoundsException(); + return (rnum == 0) ? curvec : curlbl; } /** @@ -271,16 +249,14 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * as well as block parsing. This saves the building of meta data for each * line. * - * @param line Line to process + * @return {@code true} when a valid line was read, {@code false} on a label + * row. */ - protected void parseLineInternal(String line) { - attributes.reset(); - labels.clear(); - + protected boolean parseLineInternal() { // Split into numerical attributes and labels int i = 0; - for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) { - if(labelIndices == null || !labelIndices.get(i)) { + for(/* initialized by nextLineExceptComents()! */; tokenizer.valid(); tokenizer.advance(), i++) { + if(!isLabelColumn(i) && !tokenizer.isQuoted()) { try { double attribute = tokenizer.getDouble(); attributes.add(attribute); @@ -288,34 +264,30 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract } catch(NumberFormatException e) { // Ignore attempt, add to labels below. - labelcolumns.set(i); } } // Else: labels. - haslabels = true; - final String lbl = tokenizer.getSubstring(); - String u = unique.get(lbl); - if(u == null) { - u = lbl; - unique.put(u, u); + String lbl = tokenizer.getStrippedSubstring(); + if(lbl.length() > 0) { + haslabels = true; + lbl = unique.addOrGet(lbl); + labels.add(lbl); } - labels.add(u); } // Maybe a label row? - if(lineNumber == 1 && attributes.size() == 0) { + if(getLineNumber() == 1 && attributes.size() == 0) { columnnames = new ArrayList<>(labels); - labelcolumns.clear(); - if(labelIndices != null) { - labelcolumns.or(labelIndices); - } + haslabels = false; curvec = null; curlbl = null; - haslabels = false; - return; + return false; } // Pass outside via class variables curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER); curlbl = LabelList.make(labels); + attributes.reset(); + labels.clear(); + return true; } /** @@ -338,28 +310,28 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @return Prototype object */ SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) { + if(mindim > maxdim) { + throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); + } if(mindim == maxdim) { String[] colnames = null; if(columnnames != null) { - if(columnnames.size() - labelcolumns.cardinality() == mindim) { - colnames = new String[mindim]; - for(int i = 0, j = 0; i < columnnames.size(); i++) { - if(!labelcolumns.get(i)) { - colnames[j] = columnnames.get(i); - j++; - } + colnames = new String[mindim]; + int j = 0; + for(int i = 0; i < mindim; i++) { + if(!isLabelColumn(i)) { + colnames[j] = columnnames.get(i); + j++; } } + if(j == mindim) { + colnames = null; // Did not work + } } return new VectorFieldTypeInformation<>(factory, mindim, colnames); } - else if(mindim < maxdim) { - // Variable dimensionality - return non-vector field type - return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim); - } - else { - throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); - } + // Variable dimensionality - return non-vector field type + return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim); } @Override @@ -374,7 +346,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParser.Parameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParser.Parameterizer { /** * A comma separated list of the indices of labels (may be numeric), * counting whitespace separated entries in a line starting with 0. The @@ -402,7 +374,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract /** * Factory object. */ - protected NumberVector.Factory<V, ?> factory; + protected NumberVector.Factory<V> factory; @Override protected void makeOptions(Parameterization config) { @@ -417,7 +389,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @param config Parameterization */ protected void getFactory(Parameterization config) { - ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class); + ObjectParameter<NumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class); if(config.grab(factoryP)) { factory = factoryP.instantiateClass(config); } |