diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser')
15 files changed, 652 insertions, 266 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java index 3e3ff954..e3bd77a0 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.util.ArrayList; import java.util.List; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java new file mode 100644 index 00000000..9710b493 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java @@ -0,0 +1,371 @@ +package de.lmu.ifi.dbs.elki.datasource.parser; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.StreamTokenizer; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.ExternalID; +import de.lmu.ifi.dbs.elki.data.LabelList; +import de.lmu.ifi.dbs.elki.data.SimpleClassLabel; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; + +/** + * Parser to load WEKA .arff files into ELKI. + * + * This parser is quite hackish, and contains lots of not yet configurable + * magic. + * + * TODO: Sparse vectors are not yet supported. + * + * @author Erich Schubert + */ +public class ArffParser implements Parser { + /** + * Logger + */ + private static final Logging logger = Logging.getLogger(ArffParser.class); + + /** + * Arff file marker + */ + public static final Pattern ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE); + + /** + * Arff attribute declaration marker + */ + public static final Pattern ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE); + + /** + * Arff data marker + */ + public static final Pattern ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE); + + /** + * Comment pattern. + */ + public static final Pattern ARFF_COMMENT = Pattern.compile("^\\s*%.*"); + + /** + * Pattern to auto-convert columns to external ids. 
+ */ + public static final Pattern ARFF_MAGIC_EID = Pattern.compile("(ID|External-?ID)", Pattern.CASE_INSENSITIVE); + + /** + * Pattern to auto-convert columns to class labels. + */ + public static final Pattern ARFF_MAGIC_CLASS = Pattern.compile("(Class|Class-?Label)", Pattern.CASE_INSENSITIVE); + + /** + * Pattern for numeric columns + */ + public static final Pattern ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE); + + /** + * Empty line pattern. + */ + public static final Pattern EMPTY = Pattern.compile("^\\s*$"); + + @Override + public MultipleObjectsBundle parse(InputStream instream) { + try { + BufferedReader br = new BufferedReader(new InputStreamReader(instream)); + String line; + // Locate header line + while(true) { + line = br.readLine(); + if(line == null) { + throw new AbortException(ARFF_HEADER_RELATION + " not found in file."); + } + // Skip comments and empty lines + if(ARFF_COMMENT.matcher(line).matches() || EMPTY.matcher(line).matches()) { + continue; + } + // Break on relation statement + if(ARFF_HEADER_RELATION.matcher(line).matches()) { + break; + } + throw new AbortException("Expected relation declaration: " + line); + } + ArrayList<String> names = new ArrayList<String>(); + ArrayList<String> types = new ArrayList<String>(); + // Load attribute metadata + while(true) { + line = br.readLine(); + if(line == null) { + throw new AbortException(ARFF_HEADER_DATA + " not found in file."); + } + // Skip comments and empty lines + if(ARFF_COMMENT.matcher(line).matches() || EMPTY.matcher(line).matches()) { + continue; + } + // Break on data statement to continue + if(ARFF_HEADER_DATA.matcher(line).matches()) { + break; + } + // Expect an attribute specification + Matcher matcher = ARFF_HEADER_ATTRIBUTE.matcher(line); + if(matcher.matches()) { + String name = matcher.group(1); + if(name.charAt(0) == '\'' && name.charAt(name.length() - 1) == '\'') { + name = name.substring(1, name.length() - 1); + } + else 
if(name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') { + name = name.substring(1, name.length() - 1); + } + String type = matcher.group(2); + names.add(name); + types.add(type); + // logger.warning("Attribute name: " + name + " type: " + type); + continue; + } + throw new AbortException("Unrecognized line: " + line); + } + assert (names.size() == types.size()); + + int[] targ = new int[names.size()]; + TypeInformation[] etyp = new TypeInformation[names.size()]; + int[] dims = new int[names.size()]; + + int next = 0; + for(int i = 0; i < targ.length; i++) { + // Turn into an external ID column. + if(ARFF_MAGIC_EID.matcher(names.get(i)).matches()) { + targ[i] = next; + etyp[next] = TypeUtil.EXTERNALID; + dims[next] = 1; + next++; + continue; + } + else if(ARFF_MAGIC_CLASS.matcher(names.get(i)).matches()) { + targ[i] = next; + etyp[next] = TypeUtil.CLASSLABEL; + dims[next] = 1; + next++; + continue; + } + else if(ARFF_NUMERIC.matcher(types.get(i)).matches()) { + if(next > 0 && etyp[next - 1] == TypeUtil.NUMBER_VECTOR_FIELD) { + targ[i] = next - 1; + dims[next - 1]++; + continue; + } + else { + targ[i] = next; + etyp[next] = TypeUtil.NUMBER_VECTOR_FIELD; + dims[next] = 1; + next++; + continue; + } + } + else { + if(next > 0 && etyp[next - 1] == TypeUtil.LABELLIST) { + targ[i] = next - 1; + dims[next - 1]++; + continue; + } + else { + targ[i] = next; + etyp[next] = TypeUtil.LABELLIST; + dims[next] = 1; + next++; + continue; + } + } + } + + // Prepare bundle: + // This is a bit complicated to produce vector fields. 
+ MultipleObjectsBundle bundle = new MultipleObjectsBundle(); + for(int in = 0, out = 0; in < targ.length; out++) { + int nin = in + 1; + for(; nin < targ.length; nin++) { + if(targ[nin] != targ[in]) { + break; + } + } + if(etyp[out] == TypeUtil.NUMBER_VECTOR_FIELD) { + String[] labels = new String[dims[out]]; + // Collect labels: + for(int i = 0; i < dims[out]; i++) { + labels[i] = names.get(out + i); + } + VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dims[out], labels, new DoubleVector(new double[dims[out]])); + bundle.appendColumn(type, new ArrayList<DoubleVector>()); + } + else if(etyp[out] == TypeUtil.LABELLIST) { + String label = names.get(out); + for(int i = 1; i < dims[out]; i++) { + label = label + " " + names.get(out + i); + } + bundle.appendColumn(new SimpleTypeInformation<LabelList>(LabelList.class, label), new ArrayList<LabelList>()); + } + else if(etyp[out] == TypeUtil.EXTERNALID) { + bundle.appendColumn(new SimpleTypeInformation<ExternalID>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>()); + } + else if(etyp[out] == TypeUtil.CLASSLABEL) { + bundle.appendColumn(new SimpleTypeInformation<ClassLabel>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>()); + } + else { + throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? 
etyp[out].toString() : "null")); + } + assert (out == bundle.metaLength() - 1); + // logger.warning("Added meta: " + bundle.meta(bundle.metaLength() - + // 1)); + in = nin; + } + // Setup tokenizer + StreamTokenizer tokenizer = new StreamTokenizer(br); + { + tokenizer.whitespaceChars(0, ' '); + tokenizer.wordChars(' ' + 1, '\u00FF'); + tokenizer.whitespaceChars(',', ','); + tokenizer.commentChar('%'); + tokenizer.quoteChar('"'); + tokenizer.quoteChar('\''); + tokenizer.ordinaryChar('{'); + tokenizer.ordinaryChar('}'); + tokenizer.eolIsSignificant(true); + } + + final int outdim = bundle.metaLength(); + nextToken(tokenizer); + while(tokenizer.ttype != StreamTokenizer.TT_EOF) { + // Parse instance + if(tokenizer.ttype == StreamTokenizer.TT_EOL) { + // ignore empty lines + } + else if(tokenizer.ttype != '{') { + // logger.warning("Regular instance."); + Object[] data = new Object[outdim]; + for(int out = 0; out < outdim; out++) { + if(etyp[out] == TypeUtil.NUMBER_VECTOR_FIELD) { + double[] cur = new double[dims[out]]; + for(int k = 0; k < dims[out]; k++) { + if(tokenizer.ttype != StreamTokenizer.TT_NUMBER) { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + cur[k] = tokenizer.nval; + nextToken(tokenizer); + } + data[out] = new DoubleVector(cur); + } + else if(etyp[out] == TypeUtil.LABELLIST) { + LabelList ll = new LabelList(); + for(int k = 0; k < dims[out]; k++) { + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + ll.add(tokenizer.sval); + nextToken(tokenizer); + } + data[out] = ll; + } + else if(etyp[out] == TypeUtil.EXTERNALID) { + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Expected word token, got: " + tokenizer.toString()); + } + data[out] = new ExternalID(tokenizer.sval); + nextToken(tokenizer); + } + else if(etyp[out] == TypeUtil.CLASSLABEL) { + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new 
AbortException("Expected word token, got: " + tokenizer.toString()); + } + ClassLabel lbl = new SimpleClassLabel(); + lbl.init(tokenizer.sval); + data[out] = lbl; + nextToken(tokenizer); + } + else { + throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null")); + } + } + bundle.appendSimple(data); + } + else { + logger.warning("Sparse instance."); + while(true) { + nextToken(tokenizer); + assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL); + if(tokenizer.ttype == '}') { + nextToken(tokenizer); + assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL); + break; + } + else { + // sparse token + } + } + throw new AbortException("Sparse ARFF are not (yet) supported."); + } + if(tokenizer.ttype != StreamTokenizer.TT_EOF) { + nextToken(tokenizer); + } + } + return bundle; + } + catch(IOException e) { + throw new AbortException("IO error in parser", e); + } + } + + private void nextToken(StreamTokenizer tokenizer) throws IOException { + tokenizer.nextToken(); + if((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) { + tokenizer.ttype = StreamTokenizer.TT_WORD; + } + else if((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) { + tokenizer.ttype = '?'; + } + if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) { + logger.debug("token: " + tokenizer.nval); + } + else if(tokenizer.ttype == StreamTokenizer.TT_WORD) { + logger.debug("token: " + tokenizer.sval); + } + else if(tokenizer.ttype == StreamTokenizer.TT_EOF) { + logger.debug("token: EOF"); + } + else if(tokenizer.ttype == StreamTokenizer.TT_EOL) { + logger.debug("token: EOL"); + } + else { + logger.debug("token type: " + tokenizer.ttype); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java index 41ae5cab..cd206310 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
-You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.BufferedReader; import java.io.IOException; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParser.java index f9b403d6..ce8d60e1 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.InputStream; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParsingResult.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParsingResult.java index 2c1ef55e..48f50e50 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParsingResult.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DistanceParsingResult.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, 
either version 3 of the License, or -(at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ import java.util.Map; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelTransposingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelTransposingParser.java index 540dafb4..2019346c 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelTransposingParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelTransposingParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.BufferedReader; import java.io.IOException; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/LinebasedParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/LinebasedParser.java index 5eeab768..5a9ffe14 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/LinebasedParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/LinebasedParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
-This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle; import de.lmu.ifi.dbs.elki.utilities.InspectionUtilFrequentlyScanned; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberDistanceParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberDistanceParser.java index f07d9c68..dcb20020 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberDistanceParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberDistanceParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. 
- -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ import java.io.BufferedReader; import java.io.IOException; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java index b9107cf2..a89f6c3a 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.BufferedReader; import java.io.IOException; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java index d2973bf4..8c4f44ae 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ParameterizationFunctionLabelParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
-This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.BufferedReader; import java.io.IOException; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java index e320f873..b0222b99 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. 
- -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.InputStream; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java index 6f2a5508..98f9888d 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; -/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. 
-This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ import java.io.BufferedReader; import java.io.IOException; @@ -32,6 +33,7 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import de.lmu.ifi.dbs.elki.data.ExternalID; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.spatial.Polygon; import de.lmu.ifi.dbs.elki.data.spatial.PolygonsObject; @@ -40,7 +42,6 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * Parser to load polygon data (2D and 3D only) from a simple format. One record @@ -87,12 +88,14 @@ public class SimplePolygonParser extends AbstractParser implements Parser { List<PolygonsObject> polys = new ArrayList<PolygonsObject>(); List<LabelList> labels = new ArrayList<LabelList>(); + List<ExternalID> eids = new ArrayList<ExternalID>(); try { for(String line; (line = reader.readLine()) != null; lineNumber++) { if(!line.startsWith(COMMENT) && line.length() > 0) { - Pair<PolygonsObject, LabelList> objectAndLabels = parseLine(line); - polys.add(objectAndLabels.first); - labels.add(objectAndLabels.second); + Object[] objs = parseLine(line); + polys.add((PolygonsObject)objs[0]); + labels.add((LabelList)objs[1]); + eids.add((ExternalID)objs[2]); } } } @@ -100,7 +103,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser { throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); } - return MultipleObjectsBundle.makeSimple(TypeUtil.POLYGON_TYPE, polys, TypeUtil.LABELLIST, labels); + return MultipleObjectsBundle.makeSimple(TypeUtil.POLYGON_TYPE, polys, TypeUtil.LABELLIST, labels, TypeUtil.EXTERNALID, eids); } /** @@ -110,7 +113,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser { * * @return Parsed polygon */ - 
private Pair<PolygonsObject, LabelList> parseLine(String line) { + private Object[] parseLine(String line) { List<String> entries = tokenize(line); Iterator<String> iter = entries.iterator(); @@ -153,7 +156,9 @@ public class SimplePolygonParser extends AbstractParser implements Parser { if(coords.size() > 0) { polys.add(new Polygon(coords)); } - return new Pair<PolygonsObject, LabelList>(new PolygonsObject(polys), labels); + // Use first label as external ID + ExternalID eid = labels.size() > 0 ? new ExternalID(labels.remove(0)) : null; + return new Object[] { new PolygonsObject(polys), labels, eid }; } @Override diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java index d29e7393..4b349f6a 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; + /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*/ + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.BufferedReader; import java.io.IOException; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java index caa06e84..b4e7ee3e 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java @@ -95,7 +95,6 @@ public class SparseFloatVectorLabelParser extends NumberVectorLabelParser<Sparse private int dimensionality = -1;
@Override
- @SuppressWarnings("unused")
public SparseFloatVector createDBObject(List<Double> attributes) {
throw new UnsupportedOperationException("This method should never be reached.");
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java index 42db180d..8448e4c4 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java @@ -1,26 +1,27 @@ package de.lmu.ifi.dbs.elki.datasource.parser; -/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2011 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2011 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ import java.io.BufferedReader; import java.io.IOException; @@ -86,7 +87,6 @@ public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVect } @Override - @SuppressWarnings("unused") protected SparseFloatVector createDBObject(List<Double> attributes) { throw new UnsupportedOperationException("This method should never be reached."); } |