summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java151
1 files changed, 151 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java
new file mode 100644
index 00000000..3a7ac44a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/LibSVMFormatParser.java
@@ -0,0 +1,151 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.LabelList;
+import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
+import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Parser to read libSVM format files.
+ *
+ * The format of libSVM is roughly specified in its README as:
+ *
+ * <pre>
+ * &lt;label&gt; &lt;index1&gt;:&lt;value1&gt; &lt;index2&gt;:&lt;value2&gt; ...
+ * </pre>
+ *
+ * i.e. a mandatory integer class label at the beginning, followed by a classic
+ * sparse vector representation of the data. Indexes are integers, starting at
+ * 1.
+ *
+ * NOTE(review): earlier documentation of this class stated that the indexes
+ * are mapped to index-1 to match the 0-based indexing of ELKI. The parser,
+ * however, stores every index unchanged, so for 1-based input dimension 0 is
+ * never set. Confirm which behavior is intended before changing either.
+ *
+ * The FAQ states that you can also put comments into the file, introduced by a
+ * hash sign: {@code #}, but they must not contain colons and are not
+ * officially supported. ELKI will simply stop parsing a line when encountering
+ * a {@code #}.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class LibSVMFormatParser<V extends SparseNumberVector> extends SparseNumberVectorLabelParser<V> {
+  /**
+   * Class logger.
+   */
+  private static final Logging LOG = Logging.getLogger(LibSVMFormatParser.class);
+
+  /**
+   * Column separator pattern: libSVM uses whitespace between entries and a
+   * colon between index and value, so both are treated as separators.
+   */
+  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("(\\s+|:)");
+
+  /**
+   * Comment pattern.
+   */
+  public static final Pattern COMMENT_PATTERN = Pattern.compile("#");
+
+  /**
+   * Constructor.
+   *
+   * @param factory Vector factory
+   */
+  public LibSVMFormatParser(SparseNumberVector.Factory<V> factory) {
+    super(WHITESPACE_PATTERN, null, COMMENT_PATTERN, null, factory);
+  }
+
+  /**
+   * Parse the current line: the first token is stored as label, all following
+   * tokens are consumed as (index, value) pairs until the line ends or a token
+   * starting with {@code #} is encountered.
+   *
+   * The result is stored in {@code curvec} and {@code curlbl}; the temporary
+   * {@code values} and {@code labels} buffers are cleared afterwards.
+   *
+   * @return always {@code true}
+   * @throws RuntimeException when a token is neither numeric nor a comment,
+   *         when an index has no value, or when an index does not fit a
+   *         non-negative {@code int}
+   */
+  @Override
+  protected boolean parseLineInternal() {
+    /* tokenizer initialized by nextLineExceptComments() */
+    int thismax = 0;
+
+    // TODO: rely on the string being numeric for performance
+    // But it might be missing sometimes, or "?"
+    labels.add(tokenizer.getSubstring());
+    tokenizer.advance();
+    haslabels = true; // libSVM always has labels.
+
+    while(tokenizer.valid()) {
+      try {
+        final long rawindex = tokenizer.getLongBase10();
+        // Reject values that a plain (int) cast would silently truncate, and
+        // keep index + 1 below from overflowing.
+        if(rawindex < 0 || rawindex >= Integer.MAX_VALUE) {
+          throw new RuntimeException("Parsing error in line " + getLineNumber() + ": dimension index out of range: " + rawindex);
+        }
+        final int index = (int) rawindex;
+        tokenizer.advance();
+        // An index at the very end of the line has no value to read.
+        if(!tokenizer.valid()) {
+          throw new RuntimeException("Parsing error in line " + getLineNumber() + ": missing value for index " + index);
+        }
+        double attribute = tokenizer.getDouble();
+        tokenizer.advance();
+        // Dimensionality must cover the largest index seen on this line.
+        thismax = Math.max(thismax, index + 1);
+        values.put(index, attribute);
+      }
+      catch(NumberFormatException e) {
+        String comment = tokenizer.getSubstring();
+        // A comment ends the data part of the line. Empty tokens are not
+        // comments, and must not be probed with charAt(0).
+        if(!comment.isEmpty() && comment.charAt(0) == '#') {
+          break;
+        }
+        // Keep the original exception as cause for easier debugging.
+        throw new RuntimeException("Parsing error in line " + getLineNumber() + ": expected data, got " + comment, e);
+      }
+    }
+    curvec = sparsefactory.newNumberVector(values, thismax);
+    curlbl = LabelList.make(labels);
+    values.clear();
+    labels.clear();
+    return true;
+  }
+
+  @Override
+  protected Logging getLogger() {
+    return LOG;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   *
+   * @param <V> Vector type
+   */
+  public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
+    /**
+     * Read the vector factory option, restricted to sparse vector factories,
+     * with {@link SparseFloatVector.Factory} given as the default class.
+     *
+     * @param config Parameterization
+     */
+    @Override
+    protected void getFactory(Parameterization config) {
+      ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+      if(config.grab(factoryP)) {
+        factory = factoryP.instantiateClass(config);
+      }
+    }
+
+    /**
+     * Only the vector factory is configurable; the options of the superclass
+     * are intentionally not offered.
+     *
+     * @param config Parameterization
+     */
+    @Override
+    protected void makeOptions(Parameterization config) {
+      // Avoid additional options: super.makeOptions(config);
+      getFactory(config);
+    }
+
+    @Override
+    protected LibSVMFormatParser<V> makeInstance() {
+      // getFactory() above only admits sparse vector factories.
+      return new LibSVMFormatParser<>((SparseNumberVector.Factory<V>) factory);
+    }
+  }
+}