diff options
author | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:38 +0000 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:38 +0000 |
commit | 14a486343aef55f97f54082d6b542dedebf6f3ba (patch) | |
tree | 000fcc4968578771ad265079eef7617d66de2cda /src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java | |
parent | 8300861dc4c62c5567a4e654976072f854217544 (diff) |
Import Upstream version 0.6.0
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java | 167 |
1 file changed, 96 insertions, 71 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java index 39da752b..3fe4af09 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java @@ -29,8 +29,9 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.util.ArrayList; import java.util.BitSet; -import java.util.Iterator; +import java.util.HashMap; import java.util.List; import java.util.regex.Pattern; @@ -79,25 +80,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract private static final Logging LOG = Logging.getLogger(NumberVectorLabelParser.class); /** - * A comma separated list of the indices of labels (may be numeric), counting - * whitespace separated entries in a line starting with 0. The corresponding - * entries will be treated as a label. - * <p> - * Key: {@code -parser.labelIndices} - * </p> - */ - public static final OptionID LABEL_INDICES_ID = new OptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label."); - - /** - * Parameter to specify the type of vectors to produce. - * <p> - * Key: {@code -parser.vector-type}<br /> - * Default: DoubleVector - * </p> - */ - public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes."); - - /** * Keeps the indices of the attributes to be treated as a string label. */ protected BitSet labelIndices; @@ -138,6 +120,11 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract protected BitSet labelcolumns = null; /** + * Whether or not the data set has labels. 
+ */ + protected boolean haslabels = false; + + /** * Current vector. */ protected V curvec = null; @@ -148,6 +135,21 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract protected LabelList curlbl = null; /** + * (Reused) store for numerical attributes. + */ + final TDoubleArrayList attributes = new TDoubleArrayList(); + + /** + * (Reused) store for labels. + */ + final ArrayList<String> labels = new ArrayList<>(); + + /** + * For String unification. + */ + HashMap<String, String> unique = new HashMap<>(); + + /** * Event to report next. */ Event nextevent = null; @@ -158,20 +160,20 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @param factory Vector factory */ public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) { - this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, Pattern.compile(COMMENT_PATTERN), null, factory); + this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory); } /** * Constructor. * * @param colSep Column separator - * @param quoteChar Quote character + * @param quoteChars Quote character * @param comment Comment pattern * @param labelIndices Column indexes that are numeric. 
* @param factory Vector factory */ - public NumberVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) { - super(colSep, quoteChar, comment); + public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) { + super(colSep, quoteChars, comment); this.labelIndices = labelIndices; this.factory = factory; } @@ -183,8 +185,9 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract mindim = Integer.MAX_VALUE; maxdim = 0; columnnames = null; + haslabels = false; labelcolumns = new BitSet(); - if (labelIndices != null) { + if(labelIndices != null) { labelcolumns.or(labelIndices); } } @@ -196,36 +199,31 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract @Override public Event nextEvent() { - if (nextevent != null) { + if(nextevent != null) { Event ret = nextevent; nextevent = null; return ret; } try { - for (String line; (line = reader.readLine()) != null; lineNumber++) { + for(String line; (line = reader.readLine()) != null; lineNumber++) { // Skip empty lines and comments - if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { + if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { continue; } parseLineInternal(line); // Maybe a header column? 
- if (curvec == null) { + if(curvec == null) { continue; } final int curdim = curvec.getDimensionality(); - if (maxdim < mindim) { - mindim = curdim; - maxdim = curdim; - buildMeta(); - nextevent = Event.NEXT_OBJECT; - return Event.META_CHANGED; - } else if (mindim < curdim || maxdim > curdim) { + if(curdim > maxdim || mindim > curdim) { mindim = Math.min(mindim, curdim); maxdim = Math.max(maxdim, curdim); buildMeta(); nextevent = Event.NEXT_OBJECT; return Event.META_CHANGED; - } else if (curlbl != null && meta != null && meta.size() == 1) { + } + else if(curlbl != null && meta != null && meta.size() == 1) { buildMeta(); nextevent = Event.NEXT_OBJECT; return Event.META_CHANGED; @@ -234,8 +232,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract } reader.close(); reader = null; + unique.clear(); return Event.END_OF_STREAM; - } catch (IOException e) { + } + catch(IOException e) { throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); } } @@ -244,11 +244,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * Update the meta element. 
*/ protected void buildMeta() { - if (labelcolumns.cardinality() > 0 || (labelIndices != null && labelIndices.cardinality() > 0)) { + if(haslabels) { meta = new BundleMeta(2); meta.add(getTypeInformation(mindim, maxdim)); meta.add(TypeUtil.LABELLIST); - } else { + } + else { meta = new BundleMeta(1); meta.add(getTypeInformation(mindim, maxdim)); } @@ -256,10 +257,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract @Override public Object data(int rnum) { - if (rnum == 0) { + if(rnum == 0) { return curvec; } - if (rnum == 1) { + if(rnum == 1) { return curlbl; } throw new ArrayIndexOutOfBoundsException(); @@ -273,45 +274,48 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @param line Line to process */ protected void parseLineInternal(String line) { - List<String> entries = tokenize(line); - // Split into numerical attributes and labels - TDoubleArrayList attributes = new TDoubleArrayList(entries.size()); - LabelList labels = null; + attributes.reset(); + labels.clear(); - Iterator<String> itr = entries.iterator(); - for (int i = 0; itr.hasNext(); i++) { - String ent = itr.next(); - if (labelIndices == null || !labelIndices.get(i)) { + // Split into numerical attributes and labels + int i = 0; + for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) { + if(labelIndices == null || !labelIndices.get(i)) { try { - double attribute = parseDouble(ent); + double attribute = tokenizer.getDouble(); attributes.add(attribute); continue; - } catch (NumberFormatException e) { + } + catch(NumberFormatException e) { // Ignore attempt, add to labels below. labelcolumns.set(i); } } // Else: labels. - if (labels == null) { - labels = new LabelList(1); + haslabels = true; + final String lbl = tokenizer.getSubstring(); + String u = unique.get(lbl); + if(u == null) { + u = lbl; + unique.put(u, u); } - // Make a new string, to not keep the whole file in memory! 
- labels.add(new String(ent)); + labels.add(u); } // Maybe a label row? - if (lineNumber == 1 && attributes.size() == 0) { - columnnames = labels; + if(lineNumber == 1 && attributes.size() == 0) { + columnnames = new ArrayList<>(labels); labelcolumns.clear(); - if (labelIndices != null) { + if(labelIndices != null) { labelcolumns.or(labelIndices); } curvec = null; curlbl = null; + haslabels = false; return; } // Pass outside via class variables curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER); - curlbl = labels; + curlbl = LabelList.make(labels); } /** @@ -334,13 +338,13 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @return Prototype object */ SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) { - if (mindim == maxdim) { + if(mindim == maxdim) { String[] colnames = null; - if (columnnames != null) { - if (columnnames.size() - labelcolumns.cardinality() == mindim) { + if(columnnames != null) { + if(columnnames.size() - labelcolumns.cardinality() == mindim) { colnames = new String[mindim]; - for (int i = 0, j = 0; i < columnnames.size(); i++) { - if (!labelcolumns.get(i)) { + for(int i = 0, j = 0; i < columnnames.size(); i++) { + if(!labelcolumns.get(i)) { colnames[j] = columnnames.get(i); j++; } @@ -348,10 +352,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract } } return new VectorFieldTypeInformation<>(factory, mindim, colnames); - } else if (mindim < maxdim) { + } + else if(mindim < maxdim) { // Variable dimensionality - return non-vector field type return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim); - } else { + } + else { throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); } } @@ -370,6 +376,25 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract */ public static class Parameterizer<V extends 
NumberVector<?>> extends AbstractParser.Parameterizer { /** + * A comma separated list of the indices of labels (may be numeric), + * counting whitespace separated entries in a line starting with 0. The + * corresponding entries will be treated as a label. + * <p> + * Key: {@code -parser.labelIndices} + * </p> + */ + public static final OptionID LABEL_INDICES_ID = new OptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label."); + + /** + * Parameter to specify the type of vectors to produce. + * <p> + * Key: {@code -parser.vector-type}<br /> + * Default: DoubleVector + * </p> + */ + public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes."); + + /** * Keeps the indices of the attributes to be treated as a string label. */ protected BitSet labelIndices = null; @@ -393,7 +418,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract */ protected void getFactory(Parameterization config) { ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class); - if (config.grab(factoryP)) { + if(config.grab(factoryP)) { factory = factoryP.instantiateClass(config); } } @@ -406,10 +431,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract protected void getLabelIndices(Parameterization config) { IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true); - if (config.grab(labelIndicesP)) { + if(config.grab(labelIndicesP)) { labelIndices = new BitSet(); List<Integer> labelcols = labelIndicesP.getValue(); - for (Integer idx : labelcols) { + for(Integer idx : labelcols) { labelIndices.set(idx.intValue()); } } @@ -417,7 +442,7 @@ public class NumberVectorLabelParser<V extends 
NumberVector<?>> extends Abstract @Override protected NumberVectorLabelParser<V> makeInstance() { - return new NumberVectorLabelParser<>(colSep, quoteChar, comment, labelIndices, factory); + return new NumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, factory); } } } |