package de.lmu.ifi.dbs.elki.datasource.parser;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2011
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;

/**
 * <p>
 * Provides a parser for parsing one point per line, attributes separated by
 * whitespace.
 * </p>
 * <p>
 * Several labels may be given per point. A label must not be parseable as
 * double. Lines starting with &quot;#&quot; will be ignored.
 * </p>
 * <p>
 * An index can be specified to identify an entry to be treated as class label.
 * This index counts all entries (numeric and labels as well) starting with 0.
 * </p>
 * 
 * @author Arthur Zimek
 * @param <V> the type of NumberVector used
 */
// NOTE(review): the generic bound was reconstructed after the type arguments
// were lost from this file; confirm the exact arguments of NumberVector
// against its declaration.
public abstract class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends AbstractParser implements LinebasedParser, Parser {
  /**
   * A comma separated list of the indices of labels (may be numeric), counting
   * whitespace separated entries in a line starting with 0. The corresponding
   * entries will be treated as a label.
   * <p>
   * Key: {@code -parser.labelIndices}
   * </p>
   */
  public static final OptionID LABEL_INDICES_ID = OptionID.getOrCreateOptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");

  /**
   * Keeps the indices of the attributes to be treated as a string label.
   */
  protected BitSet labelIndices;

  /**
   * Constructor.
   * 
   * @param colSep Column separator pattern, handed to the superclass
   * @param quoteChar Quotation character, handed to the superclass
   * @param labelIndices Indices of the entries that are always treated as
   *        labels, even if they would parse as a number
   */
  public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
    super(colSep, quoteChar);
    this.labelIndices = labelIndices;
  }

  /**
   * Parses the whole stream, one object per line. Empty lines and lines
   * starting with the comment prefix are skipped. All parsed vectors must
   * share the dimensionality of the first one.
   * 
   * @param in Stream to read from; it is not closed by this method
   * @return bundle with one vector column and one label list column
   * @throws IllegalArgumentException if the dimensionality differs between
   *         lines, or if reading from the stream fails
   */
  @Override
  public MultipleObjectsBundle parse(InputStream in) {
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    int lineNumber = 1;
    // Stays at -1 when the input contains no data line at all.
    int dimensionality = -1;
    List<V> vectors = new ArrayList<V>();
    List<LabelList> labels = new ArrayList<LabelList>();
    try {
      for(String line; (line = reader.readLine()) != null; lineNumber++) {
        if(line.length() == 0 || line.startsWith(COMMENT)) {
          continue;
        }
        Pair<V, LabelList> objectAndLabels = parseLineInternal(line);
        int lineDim = objectAndLabels.getFirst().getDimensionality();
        if(dimensionality < 0) {
          // The first data line fixes the expected dimensionality.
          dimensionality = lineDim;
        }
        else if(dimensionality != lineDim) {
          throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ":" + lineDim + " != " + dimensionality);
        }
        vectors.add(objectAndLabels.first);
        labels.add(objectAndLabels.second);
      }
    }
    catch(IOException e) {
      // Keep the I/O failure as cause so that it is not lost for diagnosis.
      throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".", e);
    }
    return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels);
  }

  /**
   * Parses a single line into a bundle holding the vector and its labels.
   * 
   * @param line Line to process
   * @return bundle with the vector first and the label list second
   */
  @Override
  public SingleObjectBundle parseLine(String line) {
    Pair<V, LabelList> objectAndLabels = parseLineInternal(line);
    SingleObjectBundle pkg = new SingleObjectBundle();
    pkg.append(getTypeInformation(objectAndLabels.first.getDimensionality()), objectAndLabels.first);
    pkg.append(TypeUtil.LABELLIST, objectAndLabels.second);
    return pkg;
  }

  /**
   * Internal method for parsing a single line. Used by both line based parsing
   * as well as block parsing. This saves the building of meta data for each
   * line.
   * 
   * @param line Line to process
   * @return parsing result: the vector and the labels found in the line
   */
  protected Pair<V, LabelList> parseLineInternal(String line) {
    List<String> entries = tokenize(line);
    // Split into numerical attributes and labels
    List<Double> attributes = new ArrayList<Double>(entries.size());
    LabelList labels = new LabelList();

    Iterator<String> itr = entries.iterator();
    for(int i = 0; itr.hasNext(); i++) {
      String ent = itr.next();
      if(labelIndices.get(i)) {
        // Explicitly configured label column, even if it looks numeric.
        labels.add(ent);
        continue;
      }
      try {
        attributes.add(Double.valueOf(ent));
      }
      catch(NumberFormatException e) {
        // Anything that does not parse as a double is treated as a label.
        labels.add(ent);
      }
    }

    V vec = createDBObject(attributes);
    return new Pair<V, LabelList>(vec, labels);
  }

  /**
   * <p>
   * Creates a database object of type V.
   * </p>
   * 
   * @param attributes the attributes of the vector to create.
   * @return a vector of type V containing the given attribute values
   */
  protected abstract V createDBObject(List<Double> attributes);

  /**
   * Get the type information for vectors of the given dimensionality.
   * 
   * @param dimensionality Dimensionality
   * @return Type information for the vector column
   */
  abstract protected VectorFieldTypeInformation<V> getTypeInformation(int dimensionality);

  /**
   * Parameterization class.
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   * 
   * @param <V> the type of NumberVector used
   */
  public static abstract class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParser.Parameterizer {
    /**
     * Keeps the indices of the attributes to be treated as a string label.
     */
    protected BitSet labelIndices = null;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true);

      // Always initialized, so the parser gets an empty set when the option
      // is not given.
      labelIndices = new BitSet();
      if(config.grab(labelIndicesP)) {
        List<Integer> labelcols = labelIndicesP.getValue();
        for(Integer idx : labelcols) {
          labelIndices.set(idx);
        }
      }
    }

    @Override
    protected abstract NumberVectorLabelParser<V> makeInstance();
  }
}