summary refs log tree commit diff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java 122
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java 20
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java 62
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java 74
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java 20
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java 17
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java 109
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java 5
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java 93
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java 73
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java 9
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java 38
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java 146
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java 53
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java 2
16 files changed, 530 insertions, 315 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
index 1f414055..1e689638 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -54,7 +54,8 @@ public abstract class AbstractParser {
public static final char QUOTE_CHAR = '\"';
/**
- * A pattern catching most numbers that can be parsed using Double.parseDouble:
+ * A pattern catching most numbers that can be parsed using
+ * Double.parseDouble:
*
* Some examples: <code>1</code> <code>1.</code> <code>1.2</code>
* <code>.2</code> <code>-.2e-03</code>
@@ -62,16 +63,14 @@ public abstract class AbstractParser {
public static final String NUMBER_PATTERN = "[+-]?(?:\\d+\\.?|\\d*\\.\\d+)?(?:[eE][-]?\\d+)?";
/**
- * OptionID for the column separator parameter (defaults to whitespace as in
- * {@link #DEFAULT_SEPARATOR}.
+ * Default pattern for comments.
*/
- public static final OptionID COLUMN_SEPARATOR_ID = new OptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data.");
+ public static final String COMMENT_PATTERN = "^\\s*(#|//|;).*$";
/**
- * OptionID for the quote character parameter (defaults to a double quotation
- * mark as in {@link #QUOTE_CHAR}.
+ * A sign to separate attributes.
*/
- public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote.");
+ public static final String ATTRIBUTE_CONCATENATION = " ";
/**
* Stores the column separator pattern
@@ -84,25 +83,22 @@ public abstract class AbstractParser {
protected char quoteChar = QUOTE_CHAR;
/**
- * The comment character.
+ * Comment pattern.
*/
- public static final String COMMENT = "#";
-
- /**
- * A sign to separate attributes.
- */
- public static final String ATTRIBUTE_CONCATENATION = " ";
+ protected Pattern comment = null;
/**
* Constructor.
*
* @param colSep Column separator
* @param quoteChar Quote character
+ * @param comment Comment pattern
*/
- public AbstractParser(Pattern colSep, char quoteChar) {
+ public AbstractParser(Pattern colSep, char quoteChar, Pattern comment) {
super();
this.colSep = colSep;
this.quoteChar = quoteChar;
+ this.comment = comment;
}
/**
@@ -113,16 +109,16 @@ public abstract class AbstractParser {
* @return Tokenized string
*/
protected List<String> tokenize(String input) {
- ArrayList<String> matchList = new ArrayList<String>();
+ ArrayList<String> matchList = new ArrayList<>();
Matcher m = colSep.matcher(input);
int index = 0;
boolean inquote = (input.length() > 0) && (input.charAt(0) == quoteChar);
- while(m.find()) {
+ while (m.find()) {
// Quoted code path vs. regular code path
- if(inquote && m.start() > 0) {
+ if (inquote && m.start() > 0) {
// Closing quote found?
- if(m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) {
+ if (m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) {
// Strip quote characters
if (index + 1 < m.start() - 1) {
matchList.add(input.substring(index + 1, m.start() - 1));
@@ -132,8 +128,7 @@ public abstract class AbstractParser {
// new quote?
inquote = (index < input.length()) && (input.charAt(index) == quoteChar);
}
- }
- else {
+ } else {
// Add match before separator
if (index < m.start()) {
matchList.add(input.substring(index, m.start()));
@@ -145,25 +140,23 @@ public abstract class AbstractParser {
}
}
// Nothing found - return original string.
- if(index == 0) {
+ if (index == 0) {
matchList.add(input);
return matchList;
}
// Add tail after last separator.
- if(inquote) {
- if(input.charAt(input.length() - 1) == quoteChar) {
+ if (inquote) {
+ if (input.charAt(input.length() - 1) == quoteChar) {
if (index + 1 < input.length() - 1) {
matchList.add(input.substring(index + 1, input.length() - 1));
}
- }
- else {
- getLogger().warning("Invalid quoted line in input.");
+ } else {
+ getLogger().warning("Invalid quoted line in input: no closing quote found in: " + input);
if (index < input.length()) {
matchList.add(input.substring(index, input.length()));
}
}
- }
- else {
+ } else {
if (index < input.length()) {
matchList.add(input.substring(index, input.length()));
}
@@ -190,6 +183,43 @@ public abstract class AbstractParser {
}
/**
+ * Utility function that is a bit more robust when parsing double values; in
+ * particular, it accepts infinite values, and it creates fewer objects.
+ *
+ * @param s String to parse
+ * @return parsed value
+ * @throws NumberFormatException if the string cannot be parsed as a double
+ */
+ public static double parseDouble(String s) throws NumberFormatException {
+ try {
+ return Double.parseDouble(s);
+ } catch (NumberFormatException e) {
+ int len = s.length();
+ if (len > 0) {
+ int p = 0;
+ char cur = s.charAt(p);
+ boolean isNegative = cur == '-';
+ if (isNegative && ++p < len) {
+ cur = s.charAt(p);
+ }
+ if (cur == '∞') {
+ return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
+ }
+ if (len - p == 3 && "Inf".regionMatches(true, 0, s, p, 3)) {
+ return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
+ }
+ if (len - p == 8 && "Infinity".regionMatches(true, 0, s, p, 8)) {
+ return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
+ }
+ if (len == 3 && "NaN".equalsIgnoreCase(s)) {
+ return Double.NaN;
+ }
+ }
+ throw e;
+ }
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
@@ -198,6 +228,23 @@ public abstract class AbstractParser {
*/
public abstract static class Parameterizer extends AbstractParameterizer {
/**
+ * OptionID for the column separator parameter (defaults to whitespace as in
+ * {@link #DEFAULT_SEPARATOR}).
+ */
+ public static final OptionID COLUMN_SEPARATOR_ID = new OptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data.");
+
+ /**
+ * OptionID for the quote character parameter (defaults to a double
+ * quotation mark as in {@link #QUOTE_CHAR}).
+ */
+ public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote.");
+
+ /**
+ * OptionID for the comment pattern parameter (defaults to {@link #COMMENT_PATTERN}).
+ */
+ public static final OptionID COMMENT_ID = new OptionID("string.comment", "Ignore lines in the input file that satisfy this pattern.");
+
+ /**
* Stores the column separator pattern
*/
protected Pattern colSep = null;
@@ -207,21 +254,30 @@ public abstract class AbstractParser {
*/
protected char quoteChar = QUOTE_CHAR;
+ /**
+ * Comment pattern.
+ */
+ protected Pattern comment = null;
+
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, DEFAULT_SEPARATOR);
- if(config.grab(colParam)) {
+ if (config.grab(colParam)) {
colSep = colParam.getValue();
}
StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR));
quoteParam.addConstraint(new StringLengthConstraint(1, 1));
- if(config.grab(quoteParam)) {
+ if (config.grab(quoteParam)) {
quoteChar = quoteParam.getValue().charAt(0);
}
+ PatternParameter commentP = new PatternParameter(COMMENT_ID, COMMENT_PATTERN);
+ if (config.grab(commentP)) {
+ comment = commentP.getValue();
+ }
}
@Override
protected abstract AbstractParser makeInstance();
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java
index 79f17326..53b4b6e8 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java
@@ -1,15 +1,10 @@
package de.lmu.ifi.dbs.elki.datasource.parser;
-import java.io.InputStream;
-import java.util.regex.Pattern;
-
-import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,6 +22,12 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+
+import java.io.InputStream;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+
/**
* Base class for streaming parsers.
*
@@ -38,9 +39,10 @@ public abstract class AbstractStreamingParser extends AbstractParser implements
*
* @param colSep Column separator pattern
* @param quoteChar Quote character
+ * @param comment Comment pattern
*/
- public AbstractStreamingParser(Pattern colSep, char quoteChar) {
- super(colSep, quoteChar);
+ public AbstractStreamingParser(Pattern colSep, char quoteChar, Pattern comment) {
+ super(colSep, quoteChar, comment);
}
@Override
@@ -48,4 +50,4 @@ public abstract class AbstractStreamingParser extends AbstractParser implements
this.initStream(in);
return MultipleObjectsBundle.fromStream(this);
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
index d1280fbe..718963d1 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
*/
import gnu.trove.iterator.TIntObjectIterator;
-import gnu.trove.map.hash.TIntFloatHashMap;
+import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import java.io.BufferedReader;
@@ -41,7 +41,7 @@ import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.ExternalID;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
-import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
+import de.lmu.ifi.dbs.elki.data.SparseDoubleVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
@@ -60,7 +60,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
* This parser is quite hackish, and contains lots of not yet configurable
* magic.
*
- * TODO: Sparse vectors are not yet fully supported.
+ * TODO: Allow configuration of the vector types (double, float)
+ *
+ * TODO: when encountering integer columns, produce integer vectors.
+ *
+ * TODO: allow optional class labels.
*
* @author Erich Schubert
*/
@@ -146,8 +150,8 @@ public class ArffParser implements Parser {
public MultipleObjectsBundle parse(InputStream instream) {
try {
BufferedReader br = new BufferedReader(new InputStreamReader(instream));
- ArrayList<String> names = new ArrayList<String>();
- ArrayList<String> types = new ArrayList<String>();
+ ArrayList<String> names = new ArrayList<>();
+ ArrayList<String> types = new ArrayList<>();
readHeader(br);
parseAttributeStatements(br, names, types);
@@ -205,7 +209,7 @@ public class ArffParser implements Parser {
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
// logger.warning("Sparse instance.");
- TIntObjectHashMap<Object> map = new TIntObjectHashMap<Object>();
+ TIntObjectHashMap<Object> map = new TIntObjectHashMap<>();
while(true) {
nextToken(tokenizer);
assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
@@ -216,19 +220,21 @@ public class ArffParser implements Parser {
}
else {
// sparse token
- if(tokenizer.ttype != StreamTokenizer.TT_NUMBER) {
- throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
+ if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
+ throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
}
- int dim = (int) tokenizer.nval;
+ int dim = Integer.valueOf(tokenizer.sval);
if(map.containsKey(dim)) {
throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
}
nextToken(tokenizer);
- if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) {
- map.put(dim, Double.valueOf(tokenizer.nval));
- }
- else if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
- map.put(dim, tokenizer.sval);
+ if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]])) {
+ map.put(dim, AbstractParser.parseDouble(tokenizer.sval));
+ }
+ else {
+ map.put(dim, tokenizer.sval);
+ }
}
else {
throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
@@ -247,7 +253,7 @@ public class ArffParser implements Parser {
}
assert (s >= 0);
if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
- TIntFloatHashMap f = new TIntFloatHashMap(dimsize[out]);
+ TIntDoubleHashMap f = new TIntDoubleHashMap(dimsize[out]);
for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) {
iter.advance();
int i = iter.key();
@@ -258,9 +264,9 @@ public class ArffParser implements Parser {
break;
}
double v = ((Double) iter.value()).doubleValue();
- f.put(i - s + 1, (float) v);
+ f.put(i - s, v);
}
- data[out] = new SparseFloatVector(f, dimsize[out]);
+ data[out] = new SparseDoubleVector(f, dimsize[out]);
}
else if(TypeUtil.LABELLIST.equals(elkitypes[out])) {
// Build a label list out of successive labels
@@ -292,10 +298,10 @@ public class ArffParser implements Parser {
}
}
else if(TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
- String val = (String) map.get(s);
+ Object val = map.get(s);
if(val != null) {
// TODO: support other class label types.
- ClassLabel lbl = new SimpleClassLabel(val);
+ ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
data[out] = lbl;
}
else {
@@ -321,7 +327,7 @@ public class ArffParser implements Parser {
}
else if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
try {
- cur[k] = Double.parseDouble(tokenizer.sval);
+ cur[k] = AbstractParser.parseDouble(tokenizer.sval);
}
catch(NumberFormatException e) {
throw new AbortException("Expected number value, got: " + tokenizer.sval);
@@ -381,7 +387,7 @@ public class ArffParser implements Parser {
{
tokenizer.resetSyntax();
tokenizer.whitespaceChars(0, ' ');
- tokenizer.ordinaryChars('0', '9');
+ tokenizer.ordinaryChars('0', '9'); // Do not parse numbers
tokenizer.ordinaryChar('-');
tokenizer.ordinaryChar('.');
tokenizer.wordChars(' ' + 1, '\u00FF');
@@ -421,12 +427,12 @@ public class ArffParser implements Parser {
labels[i] = names.get(out + i);
}
if(!sparse) {
- VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.FACTORY, dimsize[out], labels);
+ VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels);
bundle.appendColumn(type, new ArrayList<DoubleVector>());
}
else {
- VectorFieldTypeInformation<SparseFloatVector> type = new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.FACTORY, dimsize[out], labels);
- bundle.appendColumn(type, new ArrayList<SparseFloatVector>());
+ VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels);
+ bundle.appendColumn(type, new ArrayList<SparseDoubleVector>());
}
}
else if(TypeUtil.LABELLIST.equals(etyp[out])) {
@@ -434,13 +440,13 @@ public class ArffParser implements Parser {
for(int i = 1; i < dimsize[out]; i++) {
label.append(' ').append(names.get(out + i));
}
- bundle.appendColumn(new SimpleTypeInformation<LabelList>(LabelList.class, label.toString()), new ArrayList<LabelList>());
+ bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>());
}
else if(TypeUtil.EXTERNALID.equals(etyp[out])) {
- bundle.appendColumn(new SimpleTypeInformation<ExternalID>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>());
+ bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>());
}
else if(TypeUtil.CLASSLABEL.equals(etyp[out])) {
- bundle.appendColumn(new SimpleTypeInformation<ClassLabel>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>());
+ bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>());
}
else {
throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
index 32a26d7d..07019040 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -63,11 +63,12 @@ public class BitVectorLabelParser extends AbstractParser implements Parser {
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
*/
- public BitVectorLabelParser(Pattern colSep, char quoteChar) {
- super(colSep, quoteChar);
+ public BitVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment) {
+ super(colSep, quoteChar, comment);
}
@Override
@@ -75,48 +76,47 @@ public class BitVectorLabelParser extends AbstractParser implements Parser {
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
int lineNumber = 0;
int dimensionality = -1;
- List<BitVector> vectors = new ArrayList<BitVector>();
- List<LabelList> labels = new ArrayList<LabelList>();
+ List<BitVector> vectors = new ArrayList<>();
+ List<LabelList> labels = new ArrayList<>();
try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- if(!line.startsWith(COMMENT) && line.length() > 0) {
- List<String> entries = tokenize(line);
- // FIXME: use more efficient storage right away?
- List<Bit> attributes = new ArrayList<Bit>();
- LabelList ll = null;
- for(String entry : entries) {
- try {
- Bit attribute = Bit.valueOf(entry);
- attributes.add(attribute);
- }
- catch(NumberFormatException e) {
- if(ll == null) {
- ll = new LabelList(1);
- }
- ll.add(entry);
+ for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ // Skip empty lines and comments
+ if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ continue;
+ }
+ List<String> entries = tokenize(line);
+ // FIXME: use more efficient storage right away?
+ List<Bit> attributes = new ArrayList<>();
+ LabelList ll = null;
+ for (String entry : entries) {
+ try {
+ Bit attribute = Bit.valueOf(entry);
+ attributes.add(attribute);
+ } catch (NumberFormatException e) {
+ if (ll == null) {
+ ll = new LabelList(1);
}
+ ll.add(entry);
}
+ }
- if(dimensionality < 0) {
- dimensionality = attributes.size();
- }
- else if(dimensionality != attributes.size()) {
- throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ".");
- }
-
- vectors.add(new BitVector(attributes.toArray(new Bit[attributes.size()])));
- labels.add(ll);
+ if (dimensionality < 0) {
+ dimensionality = attributes.size();
+ } else if (dimensionality != attributes.size()) {
+ throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + ".");
}
+
+ vectors.add(new BitVector(attributes.toArray(new Bit[attributes.size()])));
+ labels.add(ll);
}
- }
- catch(IOException e) {
+ } catch (IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels);
}
protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<BitVector>(BitVector.FACTORY, dimensionality);
+ return new VectorFieldTypeInformation<>(BitVector.FACTORY, dimensionality);
}
@Override
@@ -134,7 +134,7 @@ public class BitVectorLabelParser extends AbstractParser implements Parser {
public static class Parameterizer extends AbstractParser.Parameterizer {
@Override
protected BitVectorLabelParser makeInstance() {
- return new BitVectorLabelParser(colSep, quoteChar);
+ return new BitVectorLabelParser(colSep, quoteChar, comment);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
index 0c291fb4..b95dce74 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
@@ -49,7 +49,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
*
* @apiviz.has DoubleVector
*
- * @deprecated Use NumberVectorLabelParser instead, which defaults to DoubleVector.
+ * @deprecated Use NumberVectorLabelParser instead, which defaults to
+ * DoubleVector.
*/
@Deprecated
public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVector> {
@@ -61,19 +62,20 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
*/
- public DoubleVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, DoubleVector.FACTORY);
+ public DoubleVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices) {
+ super(colSep, quoteChar, comment, labelIndices, DoubleVector.FACTORY);
}
/**
* Constructor with default values.
*/
public DoubleVectorLabelParser() {
- this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, new BitSet());
+ this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, Pattern.compile(COMMENT_PATTERN), new BitSet());
}
@Override
@@ -96,7 +98,7 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto
@Override
protected DoubleVectorLabelParser makeInstance() {
- return new DoubleVectorLabelParser(colSep, quoteChar, labelIndices);
+ return new DoubleVectorLabelParser(colSep, quoteChar, comment, labelIndices);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
index 6288da8e..71b65cfc 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2012
+Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -57,19 +57,20 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
@Deprecated
public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector> {
/**
- * Class logger
+ * Class logger.
*/
private static final Logging LOG = Logging.getLogger(FloatVectorLabelParser.class);
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
*/
- public FloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, FloatVector.FACTORY);
+ public FloatVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices) {
+ super(colSep, quoteChar, comment, labelIndices, FloatVector.FACTORY);
}
@Override
@@ -92,7 +93,7 @@ public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector>
@Override
protected FloatVectorLabelParser makeInstance() {
- return new FloatVectorLabelParser(colSep, quoteChar, labelIndices);
+ return new FloatVectorLabelParser(colSep, quoteChar, comment, labelIndices);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index ea44c072..39da752b 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -40,6 +40,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
@@ -97,16 +98,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");
/**
- * Constant used for unknown dimensionality (e.g. empty files)
- */
- public static final int DIMENSIONALITY_UNKNOWN = -1;
-
- /**
- * Constant used for records of variable dimensionality (e.g. time series)
- */
- public static final int DIMENSIONALITY_VARIABLE = -2;
-
- /**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices;
@@ -129,7 +120,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* Dimensionality reported.
*/
- protected int dimensionality;
+ protected int mindim, maxdim;
/**
* Metadata.
@@ -167,7 +158,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param factory Vector factory
*/
public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) {
- this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, null, factory);
+ this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, Pattern.compile(COMMENT_PATTERN), null, factory);
}
/**
@@ -175,11 +166,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*
* @param colSep Column separator
* @param quoteChar Quote character
+ * @param comment Comment pattern
* @param labelIndices Column indexes that are numeric.
* @param factory Vector factory
*/
- public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
- super(colSep, quoteChar);
+ public NumberVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
+ super(colSep, quoteChar, comment);
this.labelIndices = labelIndices;
this.factory = factory;
}
@@ -188,7 +180,8 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
public void initStream(InputStream in) {
reader = new BufferedReader(new InputStreamReader(in));
lineNumber = 1;
- dimensionality = DIMENSIONALITY_UNKNOWN;
+ mindim = Integer.MAX_VALUE;
+ maxdim = 0;
columnnames = null;
labelcolumns = new BitSet();
if (labelIndices != null) {
@@ -210,31 +203,34 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
}
try {
for (String line; (line = reader.readLine()) != null; lineNumber++) {
- if (!line.startsWith(COMMENT) && line.length() > 0) {
- parseLineInternal(line);
- // Maybe a header column?
- if (curvec == null) {
- continue;
- }
- if (dimensionality == DIMENSIONALITY_UNKNOWN) {
- dimensionality = curvec.getDimensionality();
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
- } else if (dimensionality > 0) {
- if (dimensionality != curvec.getDimensionality()) {
- dimensionality = DIMENSIONALITY_VARIABLE;
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
- }
- } else if (curlbl != null && meta != null && meta.size() == 1) {
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
- }
- return Event.NEXT_OBJECT;
+ // Skip empty lines and comments
+ if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ continue;
+ }
+ parseLineInternal(line);
+ // Maybe a header column?
+ if (curvec == null) {
+ continue;
}
+ final int curdim = curvec.getDimensionality();
+ if (maxdim < mindim) {
+ mindim = curdim;
+ maxdim = curdim;
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ } else if (mindim < curdim || maxdim > curdim) {
+ mindim = Math.min(mindim, curdim);
+ maxdim = Math.max(maxdim, curdim);
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ } else if (curlbl != null && meta != null && meta.size() == 1) {
+ buildMeta();
+ nextevent = Event.NEXT_OBJECT;
+ return Event.META_CHANGED;
+ }
+ return Event.NEXT_OBJECT;
}
reader.close();
reader = null;
@@ -250,11 +246,11 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
protected void buildMeta() {
if (labelcolumns.cardinality() > 0 || (labelIndices != null && labelIndices.cardinality() > 0)) {
meta = new BundleMeta(2);
- meta.add(getTypeInformation(dimensionality));
+ meta.add(getTypeInformation(mindim, maxdim));
meta.add(TypeUtil.LABELLIST);
} else {
meta = new BundleMeta(1);
- meta.add(getTypeInformation(dimensionality));
+ meta.add(getTypeInformation(mindim, maxdim));
}
}
@@ -287,7 +283,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
String ent = itr.next();
if (labelIndices == null || !labelIndices.get(i)) {
try {
- double attribute = Double.parseDouble(ent);
+ double attribute = parseDouble(ent);
attributes.add(attribute);
continue;
} catch (NumberFormatException e) {
@@ -333,15 +329,16 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
/**
* Get a prototype object for the given dimensionality.
*
- * @param dimensionality Dimensionality
+ * @param mindim Minimum dimensionality
+ * @param maxdim Maximum dimensionality
* @return Prototype object
*/
- SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
- if (dimensionality > 0) {
+ SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
+ if (mindim == maxdim) {
String[] colnames = null;
if (columnnames != null) {
- if (columnnames.size() - labelcolumns.cardinality() == dimensionality) {
- colnames = new String[dimensionality];
+ if (columnnames.size() - labelcolumns.cardinality() == mindim) {
+ colnames = new String[mindim];
for (int i = 0, j = 0; i < columnnames.size(); i++) {
if (!labelcolumns.get(i)) {
colnames[j] = columnnames.get(i);
@@ -350,13 +347,13 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
}
}
}
- return new VectorFieldTypeInformation<V>(factory, dimensionality, colnames);
- }
- // Variable dimensionality - return non-vector field type
- if (dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
+ return new VectorFieldTypeInformation<>(factory, mindim, colnames);
+ } else if (mindim < maxdim) {
+ // Variable dimensionality - return non-vector field type
+ return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
+ } else {
+ throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
- throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@Override
@@ -395,7 +392,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param config Parameterization
*/
protected void getFactory(Parameterization config) {
- ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<NumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
+ ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
if (config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -420,7 +417,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
@Override
protected NumberVectorLabelParser<V> makeInstance() {
- return new NumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory);
+ return new NumberVectorLabelParser<>(colSep, quoteChar, comment, labelIndices, factory);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
index 5b511a92..a0b4e573 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
import java.io.InputStream;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
-import de.lmu.ifi.dbs.elki.utilities.InspectionUtilFrequentlyScanned;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
/**
@@ -38,7 +37,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
* @apiviz.uses InputStream
* @apiviz.has MultipleObjectsBundle oneway - - «create»
*/
-public interface Parser extends Parameterizable, InspectionUtilFrequentlyScanned {
+public interface Parser extends Parameterizable {
/**
* Returns a list of the objects parsed from the specified input stream.
*
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
index ce366b9e..a3d46ed8 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -77,11 +77,12 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
*/
- public SimplePolygonParser(Pattern colSep, char quoteChar) {
- super(colSep, quoteChar);
+ public SimplePolygonParser(Pattern colSep, char quoteChar, Pattern comment) {
+ super(colSep, quoteChar, comment);
}
@Override
@@ -89,35 +90,35 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
int lineNumber = 1;
- List<PolygonsObject> polys = new ArrayList<PolygonsObject>();
+ List<PolygonsObject> polys = new ArrayList<>();
List<LabelList> labels = null;
- List<ExternalID> eids = new ArrayList<ExternalID>();
+ List<ExternalID> eids = new ArrayList<>();
try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- if(!line.startsWith(COMMENT) && line.length() > 0) {
- Object[] objs = parseLine(line);
- polys.add((PolygonsObject) objs[0]);
- if(objs[1] != null) {
- if(labels == null) {
- labels = new ArrayList<LabelList>();
- for(int i = 0; i < polys.size() - 1; i++) {
- labels.add(null);
- }
+ for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ // Skip empty lines and comments
+ if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ continue;
+ }
+ Object[] objs = parseLine(line);
+ polys.add((PolygonsObject) objs[0]);
+ if (objs[1] != null) {
+ if (labels == null) {
+ labels = new ArrayList<>();
+ for (int i = 0; i < polys.size() - 1; i++) {
+ labels.add(null);
}
- labels.add((LabelList) objs[1]);
}
- eids.add((ExternalID) objs[2]);
+ labels.add((LabelList) objs[1]);
}
+ eids.add((ExternalID) objs[2]);
}
- }
- catch(IOException e) {
+ } catch (IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
- if(labels != null) {
+ if (labels != null) {
return MultipleObjectsBundle.makeSimple(TypeUtil.POLYGON_TYPE, polys, TypeUtil.LABELLIST, labels, TypeUtil.EXTERNALID, eids);
- }
- else {
+ } else {
return MultipleObjectsBundle.makeSimple(TypeUtil.POLYGON_TYPE, polys, TypeUtil.EXTERNALID, eids);
}
}
@@ -135,51 +136,48 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
ExternalID eid = null;
LabelList labels = null;
- List<Polygon> polys = new ArrayList<Polygon>(1);
+ List<Polygon> polys = new ArrayList<>(1);
- List<Vector> coords = new ArrayList<Vector>();
- while(iter.hasNext()) {
+ List<Vector> coords = new ArrayList<>();
+ while (iter.hasNext()) {
String cur = iter.next();
Matcher m = COORD.matcher(cur);
- if(m.find()) {
+ if (m.find()) {
try {
double c1 = Double.parseDouble(m.group(1));
double c2 = Double.parseDouble(m.group(2));
- if(m.group(3) != null) {
+ if (m.group(3) != null) {
double c3 = Double.parseDouble(m.group(3));
coords.add(new Vector(new double[] { c1, c2, c3 }));
- }
- else {
+ } else {
coords.add(new Vector(new double[] { c1, c2 }));
}
continue;
- }
- catch(NumberFormatException e) {
+ } catch (NumberFormatException e) {
LOG.warning("Looked like a coordinate pair but didn't parse: " + cur);
}
}
// Polygon separator.
- if(cur.equals(POLYGON_SEPARATOR)) {
- if(coords.size() > 0) {
+ if (cur.equals(POLYGON_SEPARATOR)) {
+ if (coords.size() > 0) {
polys.add(new Polygon(coords));
- coords = new ArrayList<Vector>();
+ coords = new ArrayList<>();
}
continue;
}
// First label will become the External ID
- if(eid == null) {
+ if (eid == null) {
eid = new ExternalID(cur);
- }
- else {
+ } else {
// Label
- if(labels == null) {
+ if (labels == null) {
labels = new LabelList(1);
}
labels.add(cur);
}
}
// Complete polygon
- if(coords.size() > 0) {
+ if (coords.size() > 0) {
polys.add(new Polygon(coords));
}
return new Object[] { new PolygonsObject(polys), labels, eid };
@@ -201,19 +199,24 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
@Override
protected void makeOptions(Parameterization config) {
PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, "\\s+");
- if(config.grab(colParam)) {
+ if (config.grab(colParam)) {
colSep = colParam.getValue();
}
StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR));
quoteParam.addConstraint(new StringLengthConstraint(1, 1));
- if(config.grab(quoteParam)) {
+ if (config.grab(quoteParam)) {
quoteChar = quoteParam.getValue().charAt(0);
}
+
+ PatternParameter commentP = new PatternParameter(COMMENT_ID, COMMENT_PATTERN);
+ if (config.grab(commentP)) {
+ comment = commentP.getValue();
+ }
}
@Override
protected SimplePolygonParser makeInstance() {
- return new SimplePolygonParser(colSep, quoteChar);
+ return new SimplePolygonParser(colSep, quoteChar, comment);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
index 35e53bb7..5f9e5e05 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -64,11 +64,12 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
*/
- public SparseBitVectorLabelParser(Pattern colSep, char quoteChar) {
- super(colSep, quoteChar);
+ public SparseBitVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment) {
+ super(colSep, quoteChar, comment);
}
@Override
@@ -76,54 +77,54 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
int lineNumber = 0;
int dimensionality = -1;
- List<BitVector> vectors = new ArrayList<BitVector>();
- List<LabelList> lblc = new ArrayList<LabelList>();
+ List<BitVector> vectors = new ArrayList<>();
+ List<LabelList> lblc = new ArrayList<>();
try {
- List<BitSet> bitSets = new ArrayList<BitSet>();
- List<LabelList> allLabels = new ArrayList<LabelList>();
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- if(!line.startsWith(COMMENT) && line.length() > 0) {
- List<String> entries = tokenize(line);
- BitSet bitSet = new BitSet();
- LabelList labels = null;
-
- for(String entry : entries) {
- try {
- int index = Integer.parseInt(entry);
- bitSet.set(index);
- dimensionality = Math.max(dimensionality, index);
- }
- catch(NumberFormatException e) {
- if(labels == null) {
- labels = new LabelList(1);
- }
- labels.add(entry);
+ List<BitSet> bitSets = new ArrayList<>();
+ List<LabelList> allLabels = new ArrayList<>();
+ for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ // Skip empty lines and comments
+ if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ continue;
+ }
+ List<String> entries = tokenize(line);
+ BitSet bitSet = new BitSet();
+ LabelList labels = null;
+
+ for (String entry : entries) {
+ try {
+ int index = Integer.parseInt(entry);
+ bitSet.set(index);
+ dimensionality = Math.max(dimensionality, index);
+ } catch (NumberFormatException e) {
+ if (labels == null) {
+ labels = new LabelList(1);
}
+ labels.add(entry);
}
-
- bitSets.add(bitSet);
- allLabels.add(labels);
}
+
+ bitSets.add(bitSet);
+ allLabels.add(labels);
}
dimensionality++;
- for(int i = 0; i < bitSets.size(); i++) {
+ for (int i = 0; i < bitSets.size(); i++) {
BitSet bitSet = bitSets.get(i);
LabelList labels = allLabels.get(i);
vectors.add(new BitVector(bitSet, dimensionality));
lblc.add(labels);
}
- }
- catch(IOException e) {
+ } catch (IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, lblc);
}
protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<BitVector>(BitVector.FACTORY, dimensionality);
+ return new VectorFieldTypeInformation<>(BitVector.FACTORY, dimensionality);
}
-
+
@Override
protected Logging getLogger() {
return LOG;
@@ -139,7 +140,7 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
public static class Parameterizer extends AbstractParser.Parameterizer {
@Override
protected SparseBitVectorLabelParser makeInstance() {
- return new SparseBitVectorLabelParser(colSep, quoteChar);
+ return new SparseBitVectorLabelParser(colSep, quoteChar, comment);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
index 9f658b0a..d5fe6219 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
@@ -75,10 +75,11 @@ public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser<
*
* @param colSep Column separator
* @param quoteChar Quotation character
- * @param labelIndices Label indexes
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
*/
- public SparseFloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, SparseFloatVector.FACTORY);
+ public SparseFloatVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices) {
+ super(colSep, quoteChar, comment, labelIndices, SparseFloatVector.FACTORY);
}
/**
@@ -91,7 +92,7 @@ public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser<
public static class Parameterizer extends SparseNumberVectorLabelParser.Parameterizer<SparseFloatVector> {
@Override
protected SparseFloatVectorLabelParser makeInstance() {
- return new SparseFloatVectorLabelParser(colSep, quoteChar, labelIndices);
+ return new SparseFloatVectorLabelParser(colSep, quoteChar, comment, labelIndices);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
index f4ec8c59..bdd8ab77 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
@@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -86,12 +87,6 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
private static final Logging LOG = Logging.getLogger(SparseNumberVectorLabelParser.class);
/**
- * Holds the dimensionality of the parsed data which is the maximum occurring
- * index of any attribute.
- */
- private int maxdim = -1;
-
- /**
* Same as {@link #factory}, but subtype.
*/
private SparseNumberVector.Factory<V, ?> sparsefactory;
@@ -101,11 +96,12 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
*
* @param colSep Column separator
* @param quoteChar Quotation character
- * @param labelIndices Label indexes
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
* @param factory Vector factory
*/
- public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
- super(colSep, quoteChar, labelIndices, factory);
+ public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ super(colSep, quoteChar, comment, labelIndices, factory);
this.sparsefactory = factory;
}
@@ -116,7 +112,8 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
TIntDoubleHashMap values = new TIntDoubleHashMap(cardinality, 1);
LabelList labels = null;
-
+ int thismax = 0;
+
for (int i = 1; i < entries.size() - 1; i++) {
if (labelIndices == null || !labelIndices.get(i)) {
try {
@@ -124,7 +121,8 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
if (index >= maxdim) {
maxdim = index + 1;
}
- double attribute = Double.parseDouble(entries.get(i));
+ thismax = Math.max(thismax, index);
+ double attribute = parseDouble(entries.get(i));
values.put(index, attribute);
i++;
} catch (NumberFormatException e) {
@@ -144,17 +142,19 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
if (values.size() > maxdim) {
throw new AbortException("Invalid sparse vector seen: " + line);
}
+ // NOTE(review): thismax is the largest attribute index seen on this line
+ // (starting from 0), whereas maxdim is raised to index + 1 above; mindim and
+ // maxdim therefore appear to use different conventions (index vs. size), so
+ // mindim == maxdim looks unreachable here - confirm the intended semantics.
+ if (thismax < mindim) {
+ mindim = thismax;
+ }
curvec = sparsefactory.newNumberVector(values, maxdim);
curlbl = labels;
}
@Override
- protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
- if (dimensionality > 0) {
- return new VectorFieldTypeInformation<V>(factory, dimensionality);
- }
- if (dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
+ protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
+ if (mindim == maxdim) {
+ return new VectorFieldTypeInformation<>(factory, mindim);
+ } else if (mindim < maxdim) {
+ return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@@ -174,7 +174,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<SparseNumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
if (config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -182,7 +182,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte
@Override
protected SparseNumberVectorLabelParser<V> makeInstance() {
- return new SparseNumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
+ return new SparseNumberVectorLabelParser<>(colSep, quoteChar, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java
index 01579dc6..73d38e3c 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java
@@ -8,7 +8,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java
new file mode 100644
index 00000000..41f21c5d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java
@@ -0,0 +1,146 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.LabelList;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
+
+/**
+ * Parser that loads a text file for use with string similarity measures.
+ *
+ * The parser produces two relations: the first of type String, the second of
+ * type label list, which contains the same data for convenience.
+ *
+ * @author Felix Stahlberg
+ * @author Erich Schubert
+ */
+@Title("String Parser")
+@Description("Parses new line separated strings")
+public class StringParser implements Parser {
+ /**
+ * Comment pattern. Lines that match it entirely are skipped; may be
+ * {@code null}, in which case no line is treated as a comment.
+ */
+ Pattern comment;
+
+ /**
+ * Flag to trim whitespace.
+ */
+ boolean trimWhitespace;
+
+ /**
+ * Constructor.
+ *
+ * @param comment Pattern for comments.
+ * @param trimWhitespace Trim leading and trailing whitespace.
+ */
+ public StringParser(Pattern comment, boolean trimWhitespace) {
+ super();
+ this.comment = comment;
+ this.trimWhitespace = trimWhitespace;
+ }
+
+ /**
+ * Reads the input line by line and produces one string object (plus a label
+ * list holding the same string) per line that is neither empty nor a comment.
+ *
+ * The stream is decoded with the platform default charset and is not closed
+ * by this method.
+ *
+ * @param in Input stream to read
+ * @return Bundle with a string column and a label list column
+ * @throws IllegalArgumentException if reading fails; the I/O error is
+ * attached as cause
+ */
+ @Override
+ public MultipleObjectsBundle parse(InputStream in) {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+ // Count lines from 1, so that an error message names the line being read.
+ int lineNumber = 1;
+ List<String> data = new ArrayList<>();
+ List<LabelList> labels = new ArrayList<>();
+ try {
+ for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ // Skip empty lines and comments
+ // (the emptiness test uses the untrimmed line, so whitespace-only
+ // lines are kept)
+ if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ continue;
+ }
+ final String val = trimWhitespace ? line.trim() : line;
+ data.add(val);
+ // The label list repeats the string itself, for convenience.
+ LabelList ll = new LabelList(1);
+ ll.add(val);
+ labels.add(ll);
+ }
+ } catch (IOException e) {
+ // Keep the underlying I/O error as cause instead of discarding it.
+ throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".", e);
+ }
+ return MultipleObjectsBundle.makeSimple(TypeUtil.STRING, data, TypeUtil.LABELLIST, labels);
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Felix Stahlberg
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Flag to trim whitespace.
+ */
+ public static final OptionID TRIM_ID = new OptionID("string.trim", "Remove leading and trailing whitespace from each line.");
+
+ /**
+ * Comment pattern.
+ */
+ Pattern comment = null;
+
+ /**
+ * Flag to trim whitespace.
+ */
+ boolean trimWhitespace = false;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ // By default, lines whose first non-whitespace character is '#' are comments.
+ PatternParameter commentP = new PatternParameter(AbstractParser.Parameterizer.COMMENT_ID, "^\\s*#.*$");
+ if (config.grab(commentP)) {
+ comment = commentP.getValue();
+ }
+
+ Flag trimP = new Flag(TRIM_ID);
+ if (config.grab(trimP)) {
+ trimWhitespace = trimP.isTrue();
+ }
+ }
+
+ @Override
+ protected StringParser makeInstance() {
+ return new StringParser(comment, trimWhitespace);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
index 2ea6ebb5..580c5320 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2012
+ Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,10 +24,11 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
*/
import gnu.trove.iterator.TIntDoubleIterator;
+import gnu.trove.map.TObjectIntMap;
import gnu.trove.map.hash.TIntDoubleHashMap;
+import gnu.trove.map.hash.TObjectIntHashMap;
import java.util.BitSet;
-import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
@@ -36,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -62,14 +64,14 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class);
/**
- * Maximum dimension used.
+ * Number of different terms observed.
*/
- int maxdim;
+ int numterms;
/**
* Map.
*/
- HashMap<String, Integer> keymap;
+ TObjectIntMap<String> keymap;
/**
* Normalize.
@@ -85,15 +87,15 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
* Constructor.
*
* @param normalize Normalize
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
+ * @param factory Vector factory
*/
- public TermFrequencyParser(boolean normalize, Pattern colSep, char quoteChar, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
- super(colSep, quoteChar, labelIndices, factory);
+ public TermFrequencyParser(boolean normalize, Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ super(colSep, quoteChar, comment, labelIndices, factory);
this.normalize = normalize;
- this.maxdim = 0;
- this.keymap = new HashMap<String, Integer>();
+ this.keymap = new TObjectIntHashMap<>(1001, .5f, -1);
this.sparsefactory = factory;
}
@@ -111,12 +113,12 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
curterm = entries.get(i);
} else {
try {
- double attribute = Double.parseDouble(entries.get(i));
- Integer curdim = keymap.get(curterm);
- if (curdim == null) {
- curdim = Integer.valueOf(maxdim + 1);
+ double attribute = parseDouble(entries.get(i));
+ int curdim = keymap.get(curterm);
+ if (curdim < 0) {
+ curdim = numterms;
keymap.put(curterm, curdim);
- maxdim += 1;
+ ++numterms;
}
values.put(curdim, attribute);
len += attribute;
@@ -147,17 +149,16 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
}
}
- curvec = sparsefactory.newNumberVector(values, maxdim);
+ curvec = sparsefactory.newNumberVector(values, numterms);
curlbl = labels;
}
@Override
- protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
- if (dimensionality > 0) {
- return new VectorFieldTypeInformation<V>(factory, dimensionality);
- }
- if (dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
+ protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
+ if (mindim == maxdim) {
+ return new VectorFieldTypeInformation<>(factory, mindim);
+ } else if (mindim < maxdim) {
+ return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@@ -196,7 +197,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<SparseNumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
if (config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -204,7 +205,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number
@Override
protected TermFrequencyParser<V> makeInstance() {
- return new TermFrequencyParser<V>(normalize, colSep, quoteChar, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
+ return new TermFrequencyParser<>(normalize, colSep, quoteChar, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
index 58ae9a77..c21ab31f 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
@@ -42,7 +42,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2012
+Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team