package de.lmu.ifi.dbs.elki.datasource.parser;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
/**
 * <p>
 * Provides a parser for parsing one point per line, attributes separated by
 * whitespace.
 * </p>
 * <p>
 * Several labels may be given per point. A label must not be parseable as
 * double. Lines starting with "#" will be ignored.
 * </p>
 * <p>
 * An index can be specified to identify an entry to be treated as class label.
 * This index counts all entries (numeric and labels as well) starting with 0.
 * </p>
 *
 * @author Arthur Zimek
 * @param <V> the type of NumberVector used
 */
public abstract class NumberVectorLabelParser> extends AbstractParser implements LinebasedParser, Parser {
/**
* A comma separated list of the indices of labels (may be numeric), counting
* whitespace separated entries in a line starting with 0. The corresponding
* entries will be treated as a label.
*
* Key: {@code -parser.labelIndices}
*
*/
public static final OptionID LABEL_INDICES_ID = OptionID.getOrCreateOptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");
/**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices;
/**
* Constructor
*
* @param colSep
* @param quoteChar
* @param labelIndices
*/
public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
super(colSep, quoteChar);
this.labelIndices = labelIndices;
}
@Override
public MultipleObjectsBundle parse(InputStream in) {
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
int lineNumber = 1;
int dimensionality = -1;
List vectors = new ArrayList();
List labels = new ArrayList();
try {
for(String line; (line = reader.readLine()) != null; lineNumber++) {
if(!line.startsWith(COMMENT) && line.length() > 0) {
Pair objectAndLabels = parseLineInternal(line);
if(dimensionality < 0) {
dimensionality = objectAndLabels.getFirst().getDimensionality();
}
else if(dimensionality != objectAndLabels.getFirst().getDimensionality()) {
throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ":" + objectAndLabels.getFirst().getDimensionality() + " != " + dimensionality);
}
vectors.add(objectAndLabels.first);
labels.add(objectAndLabels.second);
}
}
}
catch(IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels);
}
@Override
public SingleObjectBundle parseLine(String line) {
Pair objectAndLabels = parseLineInternal(line);
SingleObjectBundle pkg = new SingleObjectBundle();
pkg.append(getTypeInformation(objectAndLabels.first.getDimensionality()), objectAndLabels.first);
pkg.append(TypeUtil.LABELLIST, objectAndLabels.second);
return pkg;
}
/**
* Internal method for parsing a single line. Used by both line based parsig
* as well as block parsing. This saves the building of meta data for each
* line.
*
* @param line Line to process
* @return parsing result
*/
protected Pair parseLineInternal(String line) {
List entries = tokenize(line);
// Split into numerical attributes and labels
List attributes = new ArrayList(entries.size());
LabelList labels = new LabelList();
Iterator itr = entries.iterator();
for(int i = 0; itr.hasNext(); i++) {
String ent = itr.next();
if(!labelIndices.get(i)) {
try {
Double attribute = Double.valueOf(ent);
attributes.add(attribute);
}
catch(NumberFormatException e) {
labels.add(ent);
}
}
else {
labels.add(ent);
}
}
Pair objectAndLabels;
V vec = createDBObject(attributes);
objectAndLabels = new Pair(vec, labels);
return objectAndLabels;
}
/**
*
* Creates a database object of type V.
*
*
* @param attributes the attributes of the vector to create.
* @return a RalVector of type V containing the given attribute values
*/
protected abstract V createDBObject(List attributes);
/**
* Get a prototype object for the given dimensionality.
*
* @param dimensionality Dimensionality
* @return Prototype object
*/
abstract protected VectorFieldTypeInformation getTypeInformation(int dimensionality);
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static abstract class Parameterizer> extends AbstractParser.Parameterizer {
/**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices = null;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true);
labelIndices = new BitSet();
if(config.grab(labelIndicesP)) {
List labelcols = labelIndicesP.getValue();
for(Integer idx : labelcols) {
labelIndices.set(idx);
}
}
}
@Override
protected abstract NumberVectorLabelParser makeInstance();
}
}