diff options
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java | 688 |
1 files changed, 688 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java new file mode 100644 index 00000000..515c68c4 --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java @@ -0,0 +1,688 @@ +package de.lmu.ifi.dbs.elki.datasource.parser; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import gnu.trove.iterator.TIntObjectIterator; +import gnu.trove.map.hash.TIntDoubleHashMap; +import gnu.trove.map.hash.TIntObjectHashMap; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.StreamTokenizer; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.ExternalID; +import de.lmu.ifi.dbs.elki.data.LabelList; +import de.lmu.ifi.dbs.elki.data.SimpleClassLabel; +import de.lmu.ifi.dbs.elki.data.SparseDoubleVector; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Parser to load WEKA .arff files into ELKI. + * + * This parser is quite hackish, and contains lots of not yet configurable + * magic. + * + * TODO: Allow configuration of the vector types (double, float) + * + * TODO: when encountering integer columns, produce integer vectors. + * + * TODO: allow optional class labels. + * + * @author Erich Schubert + */ +public class ArffParser implements Parser { + /** + * Logger. + */ + private static final Logging LOG = Logging.getLogger(ArffParser.class); + + /** + * Arff file marker. + */ + public static final Matcher ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE).matcher(""); + + /** + * Arff attribute declaration marker. + */ + public static final Matcher ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE).matcher(""); + + /** + * Arff data marker. + */ + public static final Matcher ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE).matcher(""); + + /** + * Comment pattern. + */ + public static final Matcher ARFF_COMMENT = Pattern.compile("^\\s*%.*").matcher(""); + + /** + * Pattern to auto-convert columns to external ids. + */ + public static final String DEFAULT_ARFF_MAGIC_EID = "(External-?ID)"; + + /** + * Pattern to auto-convert columns to class labels. + */ + public static final String DEFAULT_ARFF_MAGIC_CLASS = "(Class|Class-?Label)"; + + /** + * Pattern for numeric columns. + */ + public static final Matcher ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE).matcher(""); + + /** + * Empty line pattern. + */ + public static final Matcher EMPTY = Pattern.compile("^\\s*$").matcher(""); + + /** + * Pattern to recognize external ids. + */ + Matcher magic_eid; + + /** + * Pattern to recognize class label columns. + */ + Matcher magic_class; + + /** + * (Reused) buffer for building label lists. + */ + ArrayList<String> labels = new ArrayList<>(); + + /** + * Constructor. + * + * @param magic_eid Magic to recognize external IDs + * @param magic_class Magic to recognize class labels + */ + public ArffParser(Pattern magic_eid, Pattern magic_class) { + super(); + this.magic_eid = magic_eid.matcher(""); + this.magic_class = magic_class.matcher(""); + } + + /** + * Constructor. + * + * @param magic_eid Magic to recognize external IDs + * @param magic_class Magic to recognize class labels + */ + public ArffParser(String magic_eid, String magic_class) { + this(Pattern.compile(magic_eid, Pattern.CASE_INSENSITIVE), Pattern.compile(magic_class, Pattern.CASE_INSENSITIVE)); + } + + @Override + public MultipleObjectsBundle parse(InputStream instream) { + try { + BufferedReader br = new BufferedReader(new InputStreamReader(instream)); + ArrayList<String> names = new ArrayList<>(); + ArrayList<String> types = new ArrayList<>(); + + readHeader(br); + parseAttributeStatements(br, names, types); + + // Convert into column mapping. Prepare arrays to fill + int[] targ = new int[names.size()]; + TypeInformation[] elkitypes = new TypeInformation[names.size()]; + int[] dimsize = new int[names.size()]; + processColumnTypes(names, types, targ, elkitypes, dimsize); + + // Prepare bundle: + // This is a bit complicated to produce vector fields. + MultipleObjectsBundle bundle = new MultipleObjectsBundle(); + StreamTokenizer tokenizer = makeArffTokenizer(br); + + int state = 0; + + nextToken(tokenizer); + while(tokenizer.ttype != StreamTokenizer.TT_EOF) { + // Parse instance + if(tokenizer.ttype == StreamTokenizer.TT_EOL) { + // ignore empty lines + } + else if(tokenizer.ttype != '{') { + if(state == 0) { + setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, false); + state = 1; // dense + } + if(state != 1) { + throw new AbortException("Mixing dense and sparse vectors is currently not allowed."); + } + // Load a dense instance + bundle.appendSimple(loadDenseInstance(tokenizer, dimsize, elkitypes, bundle.metaLength())); + } + else { + if(state == 0) { + setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, true); + state = 2; // dense + } + if(state != 2) { + throw new AbortException("Mixing dense and sparse vectors is currently not allowed."); + } + bundle.appendSimple(loadSparseInstance(tokenizer, targ, dimsize, elkitypes, bundle.metaLength())); + } + if(tokenizer.ttype != StreamTokenizer.TT_EOF) { + nextToken(tokenizer); + } + } + return bundle; + } + catch(IOException e) { + throw new AbortException("IO error in parser", e); + } + } + + private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException { + // logger.warning("Sparse instance."); + TIntObjectHashMap<Object> map = new TIntObjectHashMap<>(); + while(true) { + nextToken(tokenizer); + assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL); + if(tokenizer.ttype == '}') { + nextToken(tokenizer); + assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL); + break; + } + else { + // sparse token + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype); + } + int dim = Integer.valueOf(tokenizer.sval); + if(map.containsKey(dim)) { + throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString()); + } + nextToken(tokenizer); + if(tokenizer.ttype == StreamTokenizer.TT_WORD) { + if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]])) { + map.put(dim, FormatUtil.parseDouble(tokenizer.sval)); + } + else { + map.put(dim, tokenizer.sval); + } + } + else { + throw new AbortException("Unexpected token type encountered: " + tokenizer.toString()); + } + } + } + Object[] data = new Object[metaLength]; + for(int out = 0; out < metaLength; out++) { + // Find the first index + int s = -1; + for(int i = 0; i < targ.length; i++) { + if(targ[i] == out && s < 0) { + s = i; + break; + } + } + assert (s >= 0); + if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) { + TIntDoubleHashMap f = new TIntDoubleHashMap(dimsize[out]); + for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) { + iter.advance(); + int i = iter.key(); + if(i < s) { + continue; + } + if(i >= s + dimsize[out]) { + break; + } + double v = ((Double) iter.value()).doubleValue(); + f.put(i - s, v); + } + data[out] = new SparseDoubleVector(f, dimsize[out]); + } + else if(TypeUtil.LABELLIST.equals(elkitypes[out])) { + // Build a label list out of successive labels + labels.clear(); + for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) { + iter.advance(); + int i = iter.key(); + if(i < s) { + continue; + } + if(i >= s + dimsize[out]) { + break; + } + String v = (String) iter.value(); + if(labels.size() < i - s) { + LOG.warning("Sparse consecutive labels are currently not correctly supported."); + } + labels.add(v); + } + data[out] = LabelList.make(labels); + } + else if(TypeUtil.EXTERNALID.equals(elkitypes[out])) { + String val = (String) map.get(s); + if(val != null) { + data[out] = new ExternalID(val); + } + else { + throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString()); + } + } + else if(TypeUtil.CLASSLABEL.equals(elkitypes[out])) { + Object val = map.get(s); + if(val != null) { + // TODO: support other class label types. + ClassLabel lbl = new SimpleClassLabel(String.valueOf(val)); + data[out] = lbl; + } + else { + throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString()); + } + } + else { + throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null")); + } + } + return data; + } + + private Object[] loadDenseInstance(StreamTokenizer tokenizer, int[] dimsize, TypeInformation[] etyp, int outdim) throws IOException { + Object[] data = new Object[outdim]; + for(int out = 0; out < outdim; out++) { + if(TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) { + // For multi-column vectors, read successive columns + double[] cur = new double[dimsize[out]]; + for(int k = 0; k < dimsize[out]; k++) { + if(tokenizer.ttype == '?') { + tokenizer.nval = Double.NaN; + } + else if(tokenizer.ttype == StreamTokenizer.TT_WORD) { + try { + cur[k] = FormatUtil.parseDouble(tokenizer.sval); + } + catch(NumberFormatException e) { + throw new AbortException("Expected number value, got: " + tokenizer.sval); + } + } + else { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + nextToken(tokenizer); + } + data[out] = new DoubleVector(cur); + } + else if(TypeUtil.LABELLIST.equals(etyp[out])) { + // Build a label list out of successive labels + labels.clear(); + for(int k = 0; k < dimsize[out]; k++) { + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + labels.add(tokenizer.sval); + nextToken(tokenizer); + } + data[out] = LabelList.make(labels); + } + else if(TypeUtil.EXTERNALID.equals(etyp[out])) { + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + data[out] = new ExternalID(tokenizer.sval); + nextToken(tokenizer); + } + else if(TypeUtil.CLASSLABEL.equals(etyp[out])) { + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + // TODO: support other class label types. + ClassLabel lbl = new SimpleClassLabel(tokenizer.sval); + data[out] = lbl; + nextToken(tokenizer); + } + else { + throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null")); + } + } + return data; + } + + /** + * Make a StreamTokenizer for the ARFF format. + * + * @param br Buffered reader + * @return Tokenizer + */ + private StreamTokenizer makeArffTokenizer(BufferedReader br) { + // Setup tokenizer + StreamTokenizer tokenizer = new StreamTokenizer(br); + { + tokenizer.resetSyntax(); + tokenizer.whitespaceChars(0, ' '); + tokenizer.ordinaryChars('0', '9'); // Do not parse numbers + tokenizer.ordinaryChar('-'); + tokenizer.ordinaryChar('.'); + tokenizer.wordChars(' ' + 1, '\u00FF'); + tokenizer.whitespaceChars(',', ','); + tokenizer.commentChar('%'); + tokenizer.quoteChar('"'); + tokenizer.quoteChar('\''); + tokenizer.ordinaryChar('{'); + tokenizer.ordinaryChar('}'); + tokenizer.eolIsSignificant(true); + } + return tokenizer; + } + + /** + * Setup the headers for the object bundle. + * + * @param names Attribute names + * @param targ Target columns + * @param etyp ELKI type information + * @param dimsize Number of dimensions in the individual types + * @param bundle Output bundle + * @param sparse Flag to create sparse vectors + */ + private void setupBundleHeaders(ArrayList<String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse) { + for(int in = 0, out = 0; in < targ.length; out++) { + int nin = in + 1; + for(; nin < targ.length; nin++) { + if(targ[nin] != targ[in]) { + break; + } + } + if(TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) { + String[] labels = new String[dimsize[out]]; + // Collect labels: + for(int i = 0; i < dimsize[out]; i++) { + labels[i] = names.get(out + i); + } + if(!sparse) { + VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels); + bundle.appendColumn(type, new ArrayList<DoubleVector>()); + } + else { + VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels); + bundle.appendColumn(type, new ArrayList<SparseDoubleVector>()); + } + } + else if(TypeUtil.LABELLIST.equals(etyp[out])) { + StringBuilder label = new StringBuilder(names.get(out)); + for(int i = 1; i < dimsize[out]; i++) { + label.append(' ').append(names.get(out + i)); + } + bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>()); + } + else if(TypeUtil.EXTERNALID.equals(etyp[out])) { + bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>()); + } + else if(TypeUtil.CLASSLABEL.equals(etyp[out])) { + bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>()); + } + else { + throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null")); + } + assert (out == bundle.metaLength() - 1); + in = nin; + } + } + + /** + * Read the dataset header part of the ARFF file, to ensure consistency. + * + * @param br Buffered Reader + * @throws IOException + */ + private void readHeader(BufferedReader br) throws IOException { + String line; + // Locate header line + while(true) { + line = br.readLine(); + if(line == null) { + throw new AbortException(ARFF_HEADER_RELATION + " not found in file."); + } + // Skip comments and empty lines + if(ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) { + continue; + } + // Break on relation statement + if(ARFF_HEADER_RELATION.reset(line).matches()) { + break; + } + throw new AbortException("Expected relation declaration: " + line); + } + } + + /** + * Parse the "@attribute" section of the ARFF file. + * + * @param br Input + * @param names List (to fill) of attribute names + * @param types List (to fill) of attribute types + * @throws IOException + */ + private void parseAttributeStatements(BufferedReader br, ArrayList<String> names, ArrayList<String> types) throws IOException { + String line; + // Load attribute metadata + while(true) { + line = br.readLine(); + if(line == null) { + throw new AbortException(ARFF_HEADER_DATA + " not found in file."); + } + // Skip comments and empty lines + if(ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) { + continue; + } + // Break on data statement to continue + if(ARFF_HEADER_DATA.reset(line).matches()) { + break; + } + // Expect an attribute specification + Matcher matcher = ARFF_HEADER_ATTRIBUTE.reset(line); + if(matcher.matches()) { + String name = matcher.group(1); + if(name.charAt(0) == '\'' && name.charAt(name.length() - 1) == '\'') { + name = name.substring(1, name.length() - 1); + } + else if(name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') { + name = name.substring(1, name.length() - 1); + } + String type = matcher.group(2); + names.add(name); + types.add(type); + // logger.warning("Attribute name: " + name + " type: " + type); + continue; + } + throw new AbortException("Unrecognized line: " + line); + } + assert (names.size() == types.size()); + } + + /** + * Process the column types (and names!) into ELKI relation style. Note that + * this will for example merge successive numerical columns into a single + * vector. + * + * @param names Attribute names + * @param types Attribute types + * @param targ Target dimension mapping (ARFF to ELKI), return value + * @param etyp ELKI type information, return value + * @param dims Number of successive dimensions, return value + */ + private void processColumnTypes(ArrayList<String> names, ArrayList<String> types, int[] targ, TypeInformation[] etyp, int[] dims) { + int next = 0; + for(int i = 0; i < targ.length; i++) { + if(magic_eid != null && magic_eid.reset(names.get(i)).matches()) { + // Turn into an external ID column. + targ[i] = next; + etyp[next] = TypeUtil.EXTERNALID; + dims[next] = 1; + next++; + continue; + } + else if(magic_class != null && magic_class.reset(names.get(i)).matches()) { + // Type as ClassLabel + targ[i] = next; + etyp[next] = TypeUtil.CLASSLABEL; + dims[next] = 1; + next++; + continue; + } + else if(ARFF_NUMERIC.reset(types.get(i)).matches()) { + // Create a number vector field + if(next > 0 && TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[next - 1])) { + targ[i] = next - 1; + dims[next - 1]++; + continue; + } + else { + targ[i] = next; + etyp[next] = TypeUtil.NUMBER_VECTOR_FIELD; + dims[next] = 1; + next++; + continue; + } + } + else { + // Use LabelList + if(next > 0 && TypeUtil.LABELLIST.equals(etyp[next - 1])) { + targ[i] = next - 1; + dims[next - 1]++; + continue; + } + else { + targ[i] = next; + etyp[next] = TypeUtil.LABELLIST; + dims[next] = 1; + next++; + continue; + } + } + } + } + + /** + * Helper function for token handling. + * + * @param tokenizer Tokenizer + * @throws IOException + */ + private void nextToken(StreamTokenizer tokenizer) throws IOException { + tokenizer.nextToken(); + if((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) { + tokenizer.ttype = StreamTokenizer.TT_WORD; + } + else if((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) { + tokenizer.ttype = '?'; + } + if(LOG.isDebugging()) { + if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) { + LOG.debug("token: " + tokenizer.nval); + } + else if(tokenizer.ttype == StreamTokenizer.TT_WORD) { + LOG.debug("token: " + tokenizer.sval); + } + else if(tokenizer.ttype == StreamTokenizer.TT_EOF) { + LOG.debug("token: EOF"); + } + else if(tokenizer.ttype == StreamTokenizer.TT_EOL) { + LOG.debug("token: EOL"); + } + else { + LOG.debug("token type: " + tokenizer.ttype); + } + } + } + + @Override + public void cleanup() { + if (magic_eid != null) { + magic_eid.reset(""); + } + if (magic_class != null) { + magic_class.reset(""); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Pattern for recognizing external ID attributes. + */ + public static final OptionID MAGIC_EID_ID = new OptionID("arff.externalid", "Pattern to recognize external ID attributes."); + + /** + * Pattern for recognizing class label attributes. + */ + public static final OptionID MAGIC_CLASS_ID = new OptionID("arff.classlabel", "Pattern to recognize class label attributes."); + + /** + * Pattern to recognize external ids + */ + Pattern magic_eid; + + /** + * Pattern to recognize class label columns + */ + Pattern magic_class; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter eidP = new PatternParameter(MAGIC_EID_ID, DEFAULT_ARFF_MAGIC_EID); + if(config.grab(eidP)) { + magic_eid = eidP.getValue(); + } + PatternParameter classP = new PatternParameter(MAGIC_CLASS_ID, DEFAULT_ARFF_MAGIC_CLASS); + if(config.grab(classP)) { + magic_class = classP.getValue(); + } + } + + @Override + protected ArffParser makeInstance() { + return new ArffParser(magic_eid, magic_class); + } + } +}
\ No newline at end of file |