package de.lmu.ifi.dbs.elki.datasource.parser;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import gnu.trove.iterator.TIntObjectIterator;
import gnu.trove.map.hash.TIntFloatHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.ClassLabel;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.ExternalID;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
/**
* Parser to load WEKA .arff files into ELKI.
*
* This parser is quite hackish, and contains lots of not yet configurable
* magic.
*
* TODO: Sparse vectors are not yet fully supported.
*
* @author Erich Schubert
*/
public class ArffParser implements Parser {
/**
* Logger.
*/
private static final Logging LOG = Logging.getLogger(ArffParser.class);
/**
* Arff file marker.
*/
public static final Pattern ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE);
/**
* Arff attribute declaration marker.
*/
public static final Pattern ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE);
/**
* Arff data marker.
*/
public static final Pattern ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE);
/**
* Comment pattern.
*/
public static final Pattern ARFF_COMMENT = Pattern.compile("^\\s*%.*");
/**
* Pattern to auto-convert columns to external ids.
*/
public static final String DEFAULT_ARFF_MAGIC_EID = "(External-?ID)";
/**
* Pattern to auto-convert columns to class labels.
*/
public static final String DEFAULT_ARFF_MAGIC_CLASS = "(Class|Class-?Label)";
/**
* Pattern for numeric columns.
*/
public static final Pattern ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE);
/**
* Empty line pattern.
*/
public static final Pattern EMPTY = Pattern.compile("^\\s*$");
/**
* Pattern to recognize external ids.
*/
Pattern magic_eid;
/**
* Pattern to recognize class label columns.
*/
Pattern magic_class;
/**
* Constructor.
*
* @param magic_eid Magic to recognize external IDs
* @param magic_class Magic to recognize class labels
*/
public ArffParser(Pattern magic_eid, Pattern magic_class) {
super();
this.magic_eid = magic_eid;
this.magic_class = magic_class;
}
/**
* Constructor.
*
* @param magic_eid Magic to recognize external IDs
* @param magic_class Magic to recognize class labels
*/
public ArffParser(String magic_eid, String magic_class) {
this(Pattern.compile(magic_eid, Pattern.CASE_INSENSITIVE), Pattern.compile(magic_class, Pattern.CASE_INSENSITIVE));
}
@Override
public MultipleObjectsBundle parse(InputStream instream) {
try {
BufferedReader br = new BufferedReader(new InputStreamReader(instream));
ArrayList names = new ArrayList();
ArrayList types = new ArrayList();
readHeader(br);
parseAttributeStatements(br, names, types);
// Convert into column mapping. Prepare arrays to fill
int[] targ = new int[names.size()];
TypeInformation[] elkitypes = new TypeInformation[names.size()];
int[] dimsize = new int[names.size()];
processColumnTypes(names, types, targ, elkitypes, dimsize);
// Prepare bundle:
// This is a bit complicated to produce vector fields.
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
StreamTokenizer tokenizer = makeArffTokenizer(br);
int state = 0;
nextToken(tokenizer);
while(tokenizer.ttype != StreamTokenizer.TT_EOF) {
// Parse instance
if(tokenizer.ttype == StreamTokenizer.TT_EOL) {
// ignore empty lines
}
else if(tokenizer.ttype != '{') {
if(state == 0) {
setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, false);
state = 1; // dense
}
if(state != 1) {
throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
}
// Load a dense instance
bundle.appendSimple(loadDenseInstance(tokenizer, dimsize, elkitypes, bundle.metaLength()));
}
else {
if(state == 0) {
setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, true);
state = 2; // dense
}
if(state != 2) {
throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
}
bundle.appendSimple(loadSparseInstance(tokenizer, targ, dimsize, elkitypes, bundle.metaLength()));
}
if(tokenizer.ttype != StreamTokenizer.TT_EOF) {
nextToken(tokenizer);
}
}
return bundle;
}
catch(IOException e) {
throw new AbortException("IO error in parser", e);
}
}
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
// logger.warning("Sparse instance.");
TIntObjectHashMap