diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java | 151 |
1 files changed, 151 insertions, 0 deletions
package de.lmu.ifi.dbs.elki.datasource.parser;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2014
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.regex.Pattern;

import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;

/**
 * Parser to read libSVM format files.
 *
 * The format of libSVM is roughly specified in the README given:
 *
 * <pre>
 * &lt;label&gt; &lt;index1&gt;:&lt;value1&gt; &lt;index2&gt;:&lt;value2&gt; ...
 * </pre>
 *
 * i.e. a mandatory integer class label in the beginning followed by a classic
 * sparse vector representation of the data. Indexes are integers, starting at
 * 1 in libSVM files.
 *
 * NOTE(review): this documentation previously stated that indexes are mapped
 * to index-1 (because ELKI uses 0-based indexing) in order to not always have
 * a constant-0 dimension 0. The implementation, however, stores every value
 * under the index exactly as read from the file. Confirm which behavior is
 * intended before relying on either.
 *
 * The FAQ states that you can also put comments into the file, separated by a
 * hash: <tt>#</tt>, but they must not contain colons and are not officially
 * supported. ELKI will simply stop parsing a line when encountering a
 * <tt>#</tt>.
 *
 * @author Erich Schubert
 *
 * @param <V> Vector type
 */
public class LibSVMFormatParser<V extends SparseNumberVector> extends SparseNumberVectorLabelParser<V> {
  /**
   * Class logger.
   */
  private static final Logging LOG = Logging.getLogger(LibSVMFormatParser.class);

  /**
   * LibSVM uses whitespace and colons for separation.
   */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("(\\s+|:)");

  /**
   * Comment pattern.
   */
  public static final Pattern COMMENT_PATTERN = Pattern.compile("#");

  /**
   * Constructor.
   *
   * @param factory Vector factory
   */
  public LibSVMFormatParser(SparseNumberVector.Factory<V> factory) {
    super(WHITESPACE_PATTERN, null, COMMENT_PATTERN, null, factory);
  }

  /**
   * Parse the current line: the first token is taken as the label, all
   * following tokens are consumed pairwise as dimension index and value.
   *
   * The resulting vector and label list are stored in {@code curvec} and
   * {@code curlbl}; the {@code values} and {@code labels} buffers are cleared
   * afterwards. These members are not declared in this class.
   *
   * @return always {@code true}
   * @throws RuntimeException if a token is neither numeric nor the start of a
   *         comment, or if a dimension index is negative or too large
   */
  @Override
  protected boolean parseLineInternal() {
    /* tokenizer initialized by nextLineExceptComments() */
    int thismax = 0;

    // TODO: rely on the string being numeric for performance
    // But it might be missing sometimes, or "?"
    labels.add(tokenizer.getSubstring());
    tokenizer.advance();
    haslabels = true; // libSVM always has labels.

    while(tokenizer.valid()) {
      try {
        final long rawIndex = tokenizer.getLongBase10();
        // Reject indexes that would be silently truncated by the int cast,
        // or that would overflow the "index + 1" dimensionality below.
        if(rawIndex < 0 || rawIndex >= Integer.MAX_VALUE) {
          throw new RuntimeException("Parsing error in line " + getLineNumber() + ": dimension index out of range: " + rawIndex);
        }
        final int index = (int) rawIndex;
        tokenizer.advance();
        final double attribute = tokenizer.getDouble();
        tokenizer.advance();
        // Dimensionality must be large enough to hold the largest index seen.
        thismax = Math.max(thismax, index + 1);
        values.put(index, attribute);
      }
      catch(NumberFormatException e) {
        // A non-numeric token is only acceptable if it starts a comment, in
        // which case the remainder of the line is ignored.
        final String comment = tokenizer.getSubstring();
        if(!comment.isEmpty() && comment.charAt(0) == '#') {
          break;
        }
        // Keep the original exception as cause for easier debugging.
        throw new RuntimeException("Parsing error in line " + getLineNumber() + ": expected data, got " + comment, e);
      }
    }
    curvec = sparsefactory.newNumberVector(values, thismax);
    curlbl = LabelList.make(labels);
    values.clear();
    labels.clear();
    return true;
  }

  /**
   * Get the class logger.
   *
   * @return Logger of this class
   */
  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   *
   * @param <V> Vector type
   */
  public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
    /**
     * Read the vector factory option, restricted to sparse vector factories
     * and defaulting to {@link SparseFloatVector.Factory}.
     *
     * @param config Parameterization to read the option from
     */
    @Override
    protected void getFactory(Parameterization config) {
      ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
      if(config.grab(factoryP)) {
        factory = factoryP.instantiateClass(config);
      }
    }

    /**
     * Configure only the vector factory; the options of the superclass are
     * deliberately not offered.
     *
     * @param config Parameterization to read the options from
     */
    @Override
    protected void makeOptions(Parameterization config) {
      // Avoid additional options: super.makeOptions(config);
      getFactory(config);
    }

    /**
     * Create the parser using the configured factory.
     *
     * @return New parser instance
     */
    @Override
    protected LibSVMFormatParser<V> makeInstance() {
      return new LibSVMFormatParser<>((SparseNumberVector.Factory<V>) factory);
    }
  }
}