package de.lmu.ifi.dbs.elki.datasource.parser;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2013
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import gnu.trove.list.array.TDoubleArrayList;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;

import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;

/**
 * <p>
 * Provides a parser for parsing one point per line, attributes separated by
 * whitespace.
 * </p>
 * <p>
 * Several labels may be given per point. A label must not be parseable as
 * double. Lines starting with "#" will be ignored.
 * </p>
 * <p>
 * An index can be specified to identify an entry to be treated as class label.
 * This index counts all entries (numeric and labels as well) starting with 0.
 * </p>
 * 
 * @author Arthur Zimek
 * 
 * @apiviz.landmark
 * @apiviz.has NumberVector
 * 
 * @param <V> the type of NumberVector used
 */
public class NumberVectorLabelParser<V extends NumberVector<?>> extends AbstractStreamingParser {
  /**
   * Logging class.
   */
  private static final Logging LOG = Logging.getLogger(NumberVectorLabelParser.class);

  /**
   * Keeps the indices of the attributes to be treated as a string label.
   */
  protected BitSet labelIndices;

  /**
   * Vector factory class.
   */
  protected NumberVector.Factory<V, ?> factory;

  /**
   * Buffer reader.
   */
  private BufferedReader reader;

  /**
   * Current line number; set to 1 by {@link #initStream} and advanced once for
   * every line consumed from the reader.
   */
  protected int lineNumber;

  /**
   * Dimensionality reported.
   */
  protected int mindim, maxdim;

  /**
   * Metadata.
   */
  protected BundleMeta meta = null;

  /**
   * Column names.
   */
  protected List<String> columnnames = null;

  /**
   * Bitset to indicate which columns are not numeric.
   */
  protected BitSet labelcolumns = null;

  /**
   * Whether or not the data set has labels.
   */
  protected boolean haslabels = false;

  /**
   * Current vector.
   */
  protected V curvec = null;

  /**
   * Current labels.
   */
  protected LabelList curlbl = null;

  /**
   * (Reused) store for numerical attributes.
   */
  final TDoubleArrayList attributes = new TDoubleArrayList();

  /**
   * (Reused) store for labels.
   */
  final ArrayList<String> labels = new ArrayList<>();

  /**
   * For String unification: maps each label string to the first instance seen,
   * so that repeated labels share one String object.
   */
  HashMap<String, String> unique = new HashMap<>();

  /**
   * Event to report next.
   */
  Event nextevent = null;

  /**
   * Constructor with defaults.
   * 
   * @param factory Vector factory
   */
  public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) {
    this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory);
  }

  /**
   * Constructor.
   * 
   * @param colSep Column separator
   * @param quoteChars Quote character
   * @param comment Comment pattern
   * @param labelIndices Column indexes to be treated as labels, even if they
   *        could be parsed as numbers; may be {@code null}
   * @param factory Vector factory
   */
  public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
    super(colSep, quoteChars, comment);
    this.labelIndices = labelIndices;
    this.factory = factory;
  }

  /**
   * Prepare parsing of a new input stream and reset the per-stream state
   * (line counter, observed dimensionality, column names and label columns).
   * 
   * @param in Input stream to read from
   */
  @Override
  public void initStream(InputStream in) {
    reader = new BufferedReader(new InputStreamReader(in));
    lineNumber = 1;
    mindim = Integer.MAX_VALUE;
    maxdim = 0;
    columnnames = null;
    haslabels = false;
    labelcolumns = new BitSet();
    if(labelIndices != null) {
      labelcolumns.or(labelIndices);
    }
  }

  /**
   * Get the current metadata.
   * 
   * @return Metadata as last built by {@link #buildMeta}, or {@code null} if
   *         none was built yet
   */
  @Override
  public BundleMeta getMeta() {
    return meta;
  }

  /**
   * Advance the stream to the next event.
   * 
   * A pending event (the {@code NEXT_OBJECT} that follows a
   * {@code META_CHANGED}) is delivered first. Otherwise, lines are read until
   * one yields a vector; empty lines, comment lines and a header row are
   * skipped. When the input is exhausted, the reader is closed and
   * {@code END_OF_STREAM} is returned.
   * 
   * @return Next event
   * @throws IllegalArgumentException when reading from the stream fails; the
   *         underlying {@link IOException} is attached as cause
   */
  @Override
  public Event nextEvent() {
    if(nextevent != null) {
      Event ret = nextevent;
      nextevent = null;
      return ret;
    }
    try {
      String line;
      while((line = reader.readLine()) != null) {
        final Event event = processLine(line);
        // Count every consumed line, also those producing an event. A "for"
        // update clause would be skipped by "return", leaving lineNumber stuck
        // and making later rows look like the first line of the file.
        lineNumber++;
        if(event != null) {
          return event;
        }
      }
      reader.close();
      reader = null;
      unique.clear();
      return Event.END_OF_STREAM;
    }
    catch(IOException e) {
      throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".", e);
    }
  }

  /**
   * Process a single line read from the input.
   * 
   * @param line Line to process
   * @return Event to report for this line, or {@code null} if the line did not
   *         produce an object (empty line, comment, or header row)
   */
  private Event processLine(String line) {
    // Skip empty lines and comments
    if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
      return null;
    }
    parseLineInternal(line);
    // Maybe a header column?
    if(curvec == null) {
      return null;
    }
    final int curdim = curvec.getDimensionality();
    if(curdim > maxdim || mindim > curdim) {
      // Observed dimensionality range changed: rebuild the type information.
      mindim = Math.min(mindim, curdim);
      maxdim = Math.max(maxdim, curdim);
      buildMeta();
      nextevent = Event.NEXT_OBJECT;
      return Event.META_CHANGED;
    }
    if(curlbl != null && meta != null && meta.size() == 1) {
      // Metadata so far only has the vector column: rebuild.
      buildMeta();
      nextevent = Event.NEXT_OBJECT;
      return Event.META_CHANGED;
    }
    return Event.NEXT_OBJECT;
  }

  /**
   * Update the meta element.
   */
  protected void buildMeta() {
    if(haslabels) {
      meta = new BundleMeta(2);
      meta.add(getTypeInformation(mindim, maxdim));
      meta.add(TypeUtil.LABELLIST);
    }
    else {
      meta = new BundleMeta(1);
      meta.add(getTypeInformation(mindim, maxdim));
    }
  }

  /**
   * Access the data of the current object.
   * 
   * @param rnum Column number: 0 for the vector, 1 for the label list
   * @return Current vector or current labels
   * @throws ArrayIndexOutOfBoundsException for any other column number
   */
  @Override
  public Object data(int rnum) {
    if(rnum == 0) {
      return curvec;
    }
    if(rnum == 1) {
      return curlbl;
    }
    throw new ArrayIndexOutOfBoundsException();
  }

  /**
   * Internal method for parsing a single line. Used by both line based parsing
   * as well as block parsing. This saves the building of meta data for each
   * line.
   * 
   * The result is passed on via {@link #curvec} and {@link #curlbl}; both are
   * {@code null} when the line was recognized as a header row.
   * 
   * @param line Line to process
   */
  protected void parseLineInternal(String line) {
    attributes.reset();
    labels.clear();

    // Split into numerical attributes and labels
    int i = 0;
    for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) {
      if(labelIndices == null || !labelIndices.get(i)) {
        try {
          double attribute = tokenizer.getDouble();
          attributes.add(attribute);
          continue;
        }
        catch(NumberFormatException e) {
          // Ignore attempt, add to labels below.
          labelcolumns.set(i);
        }
      }
      // Else: labels.
      haslabels = true;
      final String lbl = tokenizer.getSubstring();
      String u = unique.get(lbl);
      if(u == null) {
        u = lbl;
        unique.put(u, u);
      }
      labels.add(u);
    }
    // Maybe a label row?
    if(lineNumber == 1 && attributes.size() == 0) {
      columnnames = new ArrayList<>(labels);
      // The header must not count as label data: undo the bookkeeping above.
      labelcolumns.clear();
      if(labelIndices != null) {
        labelcolumns.or(labelIndices);
      }
      curvec = null;
      curlbl = null;
      haslabels = false;
      return;
    }
    // Pass outside via class variables
    curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
    curlbl = LabelList.make(labels);
  }

  /**
   * Creates a database object of type V.
   * 
   * @param attributes the attributes of the vector to create.
   * @param adapter Array adapter
   * @param <A> attribute type
   * @return a vector of type V containing the given attribute values
   */
  protected <A> V createDBObject(A attributes, NumberArrayAdapter<?, A> adapter) {
    return factory.newNumberVector(attributes, adapter);
  }

  /**
   * Get a prototype object for the given dimensionality.
   * 
   * @param mindim Minimum dimensionality
   * @param maxdim Maximum dimensionality
   * @return Prototype object
   * @throws AbortException when {@code mindim > maxdim}, i.e. no vector was
   *         read
   */
  SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
    if(mindim == maxdim) {
      String[] colnames = null;
      if(columnnames != null) {
        // Only use the header when its numeric columns match the vectors.
        if(columnnames.size() - labelcolumns.cardinality() == mindim) {
          colnames = new String[mindim];
          for(int i = 0, j = 0; i < columnnames.size(); i++) {
            if(!labelcolumns.get(i)) {
              colnames[j] = columnnames.get(i);
              j++;
            }
          }
        }
      }
      return new VectorFieldTypeInformation<>(factory, mindim, colnames);
    }
    else if(mindim < maxdim) {
      // Variable dimensionality - return non-vector field type
      return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
    }
    else {
      throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
    }
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   * 
   * @param <V> the type of NumberVector used
   */
  public static class Parameterizer<V extends NumberVector<?>> extends AbstractParser.Parameterizer {
    /**
     * A comma separated list of the indices of labels (may be numeric),
     * counting whitespace separated entries in a line starting with 0. The
     * corresponding entries will be treated as a label.
     * <p>
     * Key: {@code -parser.labelIndices}
     * </p>
     */
    public static final OptionID LABEL_INDICES_ID = new OptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");

    /**
     * Parameter to specify the type of vectors to produce.
     * <p>
     * Key: {@code -parser.vector-type}<br />
     * Default: DoubleVector
     * </p>
     */
    public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");

    /**
     * Keeps the indices of the attributes to be treated as a string label.
     */
    protected BitSet labelIndices = null;

    /**
     * Factory object.
     */
    protected NumberVector.Factory<V, ?> factory;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      getLabelIndices(config);
      getFactory(config);
    }

    /**
     * Get the object factory.
     * 
     * @param config Parameterization
     */
    protected void getFactory(Parameterization config) {
      ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
      if(config.grab(factoryP)) {
        factory = factoryP.instantiateClass(config);
      }
    }

    /**
     * Get the label indices.
     * 
     * @param config Parameterization
     */
    protected void getLabelIndices(Parameterization config) {
      IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true);
      if(config.grab(labelIndicesP)) {
        labelIndices = new BitSet();
        List<Integer> labelcols = labelIndicesP.getValue();
        for(Integer idx : labelcols) {
          labelIndices.set(idx.intValue());
        }
      }
    }

    @Override
    protected NumberVectorLabelParser<V> makeInstance() {
      return new NumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, factory);
    }
  }
}