summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java76
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java7
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java161
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java21
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java14
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java63
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java102
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java5
13 files changed, 248 insertions, 226 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
index 3c294ca4..1f414055 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
@@ -65,13 +65,13 @@ public abstract class AbstractParser {
* OptionID for the column separator parameter (defaults to whitespace as in
* {@link #DEFAULT_SEPARATOR}.
*/
- public static final OptionID COLUMN_SEPARATOR_ID = OptionID.getOrCreateOptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data.");
+ public static final OptionID COLUMN_SEPARATOR_ID = new OptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data.");
/**
* OptionID for the quote character parameter (defaults to a double quotation
* mark as in {@link #QUOTE_CHAR}.
*/
- public static final OptionID QUOTE_ID = OptionID.getOrCreateOptionID("parser.quote", "Quotation character. The default is to use a double quote.");
+ public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote.");
/**
* Stores the column separator pattern
@@ -196,7 +196,7 @@ public abstract class AbstractParser {
*
* @apiviz.exclude
*/
- public static abstract class Parameterizer extends AbstractParameterizer {
+ public abstract static class Parameterizer extends AbstractParameterizer {
/**
* Stores the column separator pattern
*/
@@ -214,7 +214,8 @@ public abstract class AbstractParser {
if(config.grab(colParam)) {
colSep = colParam.getValue();
}
- StringParameter quoteParam = new StringParameter(QUOTE_ID, new StringLengthConstraint(1, 1), ""+QUOTE_CHAR);
+ StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR));
+ quoteParam.addConstraint(new StringLengthConstraint(1, 1));
if(config.grab(quoteParam)) {
quoteChar = quoteParam.getValue().charAt(0);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
index 9d77b921..d1280fbe 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
@@ -66,22 +66,22 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
*/
public class ArffParser implements Parser {
/**
- * Logger
+ * Logger.
*/
- private static final Logging logger = Logging.getLogger(ArffParser.class);
+ private static final Logging LOG = Logging.getLogger(ArffParser.class);
/**
- * Arff file marker
+ * Arff file marker.
*/
public static final Pattern ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE);
/**
- * Arff attribute declaration marker
+ * Arff attribute declaration marker.
*/
public static final Pattern ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE);
/**
- * Arff data marker
+ * Arff data marker.
*/
public static final Pattern ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE);
@@ -101,7 +101,7 @@ public class ArffParser implements Parser {
public static final String DEFAULT_ARFF_MAGIC_CLASS = "(Class|Class-?Label)";
/**
- * Pattern for numeric columns
+ * Pattern for numeric columns.
*/
public static final Pattern ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE);
@@ -111,12 +111,12 @@ public class ArffParser implements Parser {
public static final Pattern EMPTY = Pattern.compile("^\\s*$");
/**
- * Pattern to recognize external ids
+ * Pattern to recognize external ids.
*/
Pattern magic_eid;
/**
- * Pattern to recognize class label columns
+ * Pattern to recognize class label columns.
*/
Pattern magic_class;
@@ -225,7 +225,7 @@ public class ArffParser implements Parser {
}
nextToken(tokenizer);
if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) {
- map.put(dim, tokenizer.nval);
+ map.put(dim, Double.valueOf(tokenizer.nval));
}
else if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
map.put(dim, tokenizer.sval);
@@ -246,7 +246,7 @@ public class ArffParser implements Parser {
}
}
assert (s >= 0);
- if(elkitypes[out] == TypeUtil.NUMBER_VECTOR_FIELD) {
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
TIntFloatHashMap f = new TIntFloatHashMap(dimsize[out]);
for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) {
iter.advance();
@@ -257,12 +257,12 @@ public class ArffParser implements Parser {
if(i >= s + dimsize[out]) {
break;
}
- double v = (Double) iter.value();
+ double v = ((Double) iter.value()).doubleValue();
f.put(i - s + 1, (float) v);
}
data[out] = new SparseFloatVector(f, dimsize[out]);
}
- else if(elkitypes[out] == TypeUtil.LABELLIST) {
+ else if(TypeUtil.LABELLIST.equals(elkitypes[out])) {
// Build a label list out of successive labels
LabelList ll = new LabelList(1);
for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) {
@@ -276,13 +276,13 @@ public class ArffParser implements Parser {
}
String v = (String) iter.value();
if(ll.size() < i - s) {
- logger.warning("Sparse consecutive labels are currently not correctly supported.");
+ LOG.warning("Sparse consecutive labels are currently not correctly supported.");
}
ll.add(v);
}
data[out] = ll;
}
- else if(elkitypes[out] == TypeUtil.EXTERNALID) {
+ else if(TypeUtil.EXTERNALID.equals(elkitypes[out])) {
String val = (String) map.get(s);
if(val != null) {
data[out] = new ExternalID(val);
@@ -291,7 +291,7 @@ public class ArffParser implements Parser {
throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
}
}
- else if(elkitypes[out] == TypeUtil.CLASSLABEL) {
+ else if(TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
String val = (String) map.get(s);
if(val != null) {
// TODO: support other class label types.
@@ -312,7 +312,7 @@ public class ArffParser implements Parser {
private Object[] loadDenseInstance(StreamTokenizer tokenizer, int[] dimsize, TypeInformation[] etyp, int outdim) throws IOException {
Object[] data = new Object[outdim];
for(int out = 0; out < outdim; out++) {
- if(etyp[out] == TypeUtil.NUMBER_VECTOR_FIELD) {
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
// For multi-column vectors, read successive columns
double[] cur = new double[dimsize[out]];
for(int k = 0; k < dimsize[out]; k++) {
@@ -334,7 +334,7 @@ public class ArffParser implements Parser {
}
data[out] = new DoubleVector(cur);
}
- else if(etyp[out] == TypeUtil.LABELLIST) {
+ else if(TypeUtil.LABELLIST.equals(etyp[out])) {
// Build a label list out of successive labels
LabelList ll = new LabelList(dimsize[out]);
for(int k = 0; k < dimsize[out]; k++) {
@@ -346,14 +346,14 @@ public class ArffParser implements Parser {
}
data[out] = ll;
}
- else if(etyp[out] == TypeUtil.EXTERNALID) {
+ else if(TypeUtil.EXTERNALID.equals(etyp[out])) {
if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
throw new AbortException("Expected word token, got: " + tokenizer.toString());
}
data[out] = new ExternalID(tokenizer.sval);
nextToken(tokenizer);
}
- else if(etyp[out] == TypeUtil.CLASSLABEL) {
+ else if(TypeUtil.CLASSLABEL.equals(etyp[out])) {
if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
throw new AbortException("Expected word token, got: " + tokenizer.toString());
}
@@ -414,32 +414,32 @@ public class ArffParser implements Parser {
break;
}
}
- if(etyp[out] == TypeUtil.NUMBER_VECTOR_FIELD) {
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
String[] labels = new String[dimsize[out]];
// Collect labels:
for(int i = 0; i < dimsize[out]; i++) {
labels[i] = names.get(out + i);
}
if(!sparse) {
- VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, DoubleVector.STATIC, dimsize[out], labels, new DoubleVector(new double[dimsize[out]]));
+ VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.FACTORY, dimsize[out], labels);
bundle.appendColumn(type, new ArrayList<DoubleVector>());
}
else {
- VectorFieldTypeInformation<SparseFloatVector> type = new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, dimsize[out], labels, new SparseFloatVector(SparseFloatVector.EMPTYMAP, dimsize[out]));
+ VectorFieldTypeInformation<SparseFloatVector> type = new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.FACTORY, dimsize[out], labels);
bundle.appendColumn(type, new ArrayList<SparseFloatVector>());
}
}
- else if(etyp[out] == TypeUtil.LABELLIST) {
- String label = names.get(out);
+ else if(TypeUtil.LABELLIST.equals(etyp[out])) {
+ StringBuilder label = new StringBuilder(names.get(out));
for(int i = 1; i < dimsize[out]; i++) {
- label = label + " " + names.get(out + i);
+ label.append(' ').append(names.get(out + i));
}
- bundle.appendColumn(new SimpleTypeInformation<LabelList>(LabelList.class, label), new ArrayList<LabelList>());
+ bundle.appendColumn(new SimpleTypeInformation<LabelList>(LabelList.class, label.toString()), new ArrayList<LabelList>());
}
- else if(etyp[out] == TypeUtil.EXTERNALID) {
+ else if(TypeUtil.EXTERNALID.equals(etyp[out])) {
bundle.appendColumn(new SimpleTypeInformation<ExternalID>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>());
}
- else if(etyp[out] == TypeUtil.CLASSLABEL) {
+ else if(TypeUtil.CLASSLABEL.equals(etyp[out])) {
bundle.appendColumn(new SimpleTypeInformation<ClassLabel>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>());
}
else {
@@ -553,7 +553,7 @@ public class ArffParser implements Parser {
}
else if(ARFF_NUMERIC.matcher(types.get(i)).matches()) {
// Create a number vector field
- if(next > 0 && etyp[next - 1] == TypeUtil.NUMBER_VECTOR_FIELD) {
+ if(next > 0 && TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[next - 1])) {
targ[i] = next - 1;
dims[next - 1]++;
continue;
@@ -568,7 +568,7 @@ public class ArffParser implements Parser {
}
else {
// Use LabelList
- if(next > 0 && etyp[next - 1] == TypeUtil.LABELLIST) {
+ if(next > 0 && TypeUtil.LABELLIST.equals(etyp[next - 1])) {
targ[i] = next - 1;
dims[next - 1]++;
continue;
@@ -598,21 +598,21 @@ public class ArffParser implements Parser {
else if((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
- if(logger.isDebugging()) {
+ if(LOG.isDebugging()) {
if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) {
- logger.debug("token: " + tokenizer.nval);
+ LOG.debug("token: " + tokenizer.nval);
}
else if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
- logger.debug("token: " + tokenizer.sval);
+ LOG.debug("token: " + tokenizer.sval);
}
else if(tokenizer.ttype == StreamTokenizer.TT_EOF) {
- logger.debug("token: EOF");
+ LOG.debug("token: EOF");
}
else if(tokenizer.ttype == StreamTokenizer.TT_EOL) {
- logger.debug("token: EOL");
+ LOG.debug("token: EOL");
}
else {
- logger.debug("token type: " + tokenizer.ttype);
+ LOG.debug("token type: " + tokenizer.ttype);
}
}
}
@@ -628,12 +628,12 @@ public class ArffParser implements Parser {
/**
* Pattern for recognizing external ID attributes.
*/
- public static final OptionID MAGIC_EID_ID = OptionID.getOrCreateOptionID("arff.externalid", "Pattern to recognize external ID attributes.");
+ public static final OptionID MAGIC_EID_ID = new OptionID("arff.externalid", "Pattern to recognize external ID attributes.");
/**
* Pattern for recognizing class label attributes.
*/
- public static final OptionID MAGIC_CLASS_ID = OptionID.getOrCreateOptionID("arff.classlabel", "Pattern to recognize class label attributes.");
+ public static final OptionID MAGIC_CLASS_ID = new OptionID("arff.classlabel", "Pattern to recognize class label attributes.");
/**
* Pattern to recognize external ids
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
index c62392b4..32a26d7d 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java
@@ -28,7 +28,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
-import java.util.BitSet;
import java.util.List;
import java.util.regex.Pattern;
@@ -59,7 +58,7 @@ public class BitVectorLabelParser extends AbstractParser implements Parser {
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(BitVectorLabelParser.class);
+ private static final Logging LOG = Logging.getLogger(BitVectorLabelParser.class);
/**
* Constructor.
@@ -117,12 +116,12 @@ public class BitVectorLabelParser extends AbstractParser implements Parser {
}
protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<BitVector>(BitVector.class, dimensionality, new BitVector(new BitSet(), dimensionality));
+ return new VectorFieldTypeInformation<BitVector>(BitVector.FACTORY, dimensionality);
}
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
index 7bc52a29..0c291fb4 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java
@@ -56,7 +56,7 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(DoubleVectorLabelParser.class);
+ private static final Logging LOG = Logging.getLogger(DoubleVectorLabelParser.class);
/**
* Constructor.
@@ -66,7 +66,7 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto
* @param labelIndices
*/
public DoubleVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, DoubleVector.STATIC);
+ super(colSep, quoteChar, labelIndices, DoubleVector.FACTORY);
}
/**
@@ -78,7 +78,7 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
index 6465f89a..6288da8e 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java
@@ -59,7 +59,7 @@ public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector>
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(FloatVectorLabelParser.class);
+ private static final Logging LOG = Logging.getLogger(FloatVectorLabelParser.class);
/**
* Constructor.
@@ -69,12 +69,12 @@ public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector>
* @param labelIndices
*/
public FloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, FloatVector.STATIC);
+ super(colSep, quoteChar, labelIndices, FloatVector.FACTORY);
}
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index 53ce44bc..ea44c072 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -42,7 +42,6 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.persistent.ByteBufferSerializer;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -72,11 +71,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
*
* @param <V> the type of NumberVector used
*/
-public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends AbstractStreamingParser {
+public class NumberVectorLabelParser<V extends NumberVector<?>> extends AbstractStreamingParser {
/**
* Logging class.
*/
- private static final Logging logger = Logging.getLogger(NumberVectorLabelParser.class);
+ private static final Logging LOG = Logging.getLogger(NumberVectorLabelParser.class);
/**
* A comma separated list of the indices of labels (may be numeric), counting
@@ -86,7 +85,7 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
* Key: {@code -parser.labelIndices}
* </p>
*/
- public static final OptionID LABEL_INDICES_ID = OptionID.getOrCreateOptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");
+ public static final OptionID LABEL_INDICES_ID = new OptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");
/**
* Parameter to specify the type of vectors to produce.
@@ -95,7 +94,7 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
* Default: DoubleVector
* </p>
*/
- public static final OptionID VECTOR_TYPE_ID = OptionID.getOrCreateOptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");
+ public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");
/**
* Constant used for unknown dimensionality (e.g. empty files)
@@ -113,73 +112,73 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
protected BitSet labelIndices;
/**
- * Vector factory class
+ * Vector factory class.
*/
- protected V factory;
+ protected NumberVector.Factory<V, ?> factory;
/**
- * Buffer reader
+ * Buffer reader.
*/
private BufferedReader reader;
/**
- * Current line number
+ * Current line number.
*/
protected int lineNumber;
/**
- * Dimensionality reported
+ * Dimensionality reported.
*/
protected int dimensionality;
/**
- * Metadata
+ * Metadata.
*/
protected BundleMeta meta = null;
/**
- * Column names
+ * Column names.
*/
protected List<String> columnnames = null;
/**
- * Bitset to indicate which columns are numeric
+ * Bitset to indicate which columns are not numeric.
*/
protected BitSet labelcolumns = null;
/**
- * Current vector
+ * Current vector.
*/
protected V curvec = null;
/**
- * Current labels
+ * Current labels.
*/
protected LabelList curlbl = null;
/**
- * Event to report next
+ * Event to report next.
*/
Event nextevent = null;
/**
- * Constructor with defaults
+ * Constructor with defaults.
*
* @param factory Vector factory
*/
- public NumberVectorLabelParser(V factory) {
+ public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) {
this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, null, factory);
}
/**
- * Constructor
+ * Constructor.
*
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quote character
+ * @param labelIndices Column indexes to be treated as labels (not parsed as numeric); may be null.
* @param factory Vector factory
*/
- public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, V factory) {
+ public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
super(colSep, quoteChar);
this.labelIndices = labelIndices;
this.factory = factory;
@@ -192,6 +191,9 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
dimensionality = DIMENSIONALITY_UNKNOWN;
columnnames = null;
labelcolumns = new BitSet();
+ if (labelIndices != null) {
+ labelcolumns.or(labelIndices);
+ }
}
@Override
@@ -201,27 +203,26 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
@Override
public Event nextEvent() {
- if(nextevent != null) {
+ if (nextevent != null) {
Event ret = nextevent;
nextevent = null;
return ret;
}
try {
- for(String line; (line = reader.readLine()) != null; lineNumber++) {
- if(!line.startsWith(COMMENT) && line.length() > 0) {
+ for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ if (!line.startsWith(COMMENT) && line.length() > 0) {
parseLineInternal(line);
// Maybe a header column?
- if(curvec == null) {
+ if (curvec == null) {
continue;
}
- if(dimensionality == DIMENSIONALITY_UNKNOWN) {
+ if (dimensionality == DIMENSIONALITY_UNKNOWN) {
dimensionality = curvec.getDimensionality();
buildMeta();
nextevent = Event.NEXT_OBJECT;
return Event.META_CHANGED;
- }
- else if(dimensionality > 0) {
- if(dimensionality != curvec.getDimensionality()) {
+ } else if (dimensionality > 0) {
+ if (dimensionality != curvec.getDimensionality()) {
dimensionality = DIMENSIONALITY_VARIABLE;
buildMeta();
nextevent = Event.NEXT_OBJECT;
@@ -238,8 +239,7 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
reader.close();
reader = null;
return Event.END_OF_STREAM;
- }
- catch(IOException e) {
+ } catch (IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
}
@@ -248,12 +248,11 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
* Update the meta element.
*/
protected void buildMeta() {
- if(labelcolumns.cardinality() > 0) {
+ if (labelcolumns.cardinality() > 0 || (labelIndices != null && labelIndices.cardinality() > 0)) {
meta = new BundleMeta(2);
meta.add(getTypeInformation(dimensionality));
meta.add(TypeUtil.LABELLIST);
- }
- else {
+ } else {
meta = new BundleMeta(1);
meta.add(getTypeInformation(dimensionality));
}
@@ -261,10 +260,10 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
@Override
public Object data(int rnum) {
- if(rnum == 0) {
+ if (rnum == 0) {
return curvec;
}
- if(rnum == 1) {
+ if (rnum == 1) {
return curlbl;
}
throw new ArrayIndexOutOfBoundsException();
@@ -284,28 +283,32 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
LabelList labels = null;
Iterator<String> itr = entries.iterator();
- for(int i = 0; itr.hasNext(); i++) {
+ for (int i = 0; itr.hasNext(); i++) {
String ent = itr.next();
- if(!labelIndices.get(i)) {
+ if (labelIndices == null || !labelIndices.get(i)) {
try {
double attribute = Double.parseDouble(ent);
attributes.add(attribute);
continue;
- }
- catch(NumberFormatException e) {
+ } catch (NumberFormatException e) {
// Ignore attempt, add to labels below.
labelcolumns.set(i);
}
}
- if(labels == null) {
+ // Else: labels.
+ if (labels == null) {
labels = new LabelList(1);
}
- labels.add(ent);
+ // Make a new string, to not keep the whole file in memory!
+ labels.add(new String(ent));
}
// Maybe a label row?
- if(lineNumber == 1 && attributes.size() == 0) {
+ if (lineNumber == 1 && attributes.size() == 0) {
columnnames = labels;
labelcolumns.clear();
+ if (labelIndices != null) {
+ labelcolumns.or(labelIndices);
+ }
curvec = null;
curlbl = null;
return;
@@ -316,11 +319,11 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
}
/**
- * <p>
* Creates a database object of type V.
- * </p>
*
* @param attributes the attributes of the vector to create.
+ * @param adapter Array adapter
+ * @param <A> attribute type
* @return a RalVector of type V containing the given attribute values
*/
protected <A> V createDBObject(A attributes, NumberArrayAdapter<?, A> adapter) {
@@ -334,47 +337,31 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
* @return Prototype object
*/
SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
- @SuppressWarnings("unchecked")
- Class<V> cls = (Class<V>) factory.getClass();
- if(dimensionality > 0) {
+ if (dimensionality > 0) {
String[] colnames = null;
- if(columnnames != null) {
- if(columnnames.size() - labelcolumns.cardinality() == dimensionality) {
+ if (columnnames != null) {
+ if (columnnames.size() - labelcolumns.cardinality() == dimensionality) {
colnames = new String[dimensionality];
- for(int i = 0, j = 0; i < columnnames.size(); i++) {
- if(labelcolumns.get(i) == false) {
+ for (int i = 0, j = 0; i < columnnames.size(); i++) {
+ if (!labelcolumns.get(i)) {
colnames[j] = columnnames.get(i);
j++;
}
}
}
}
- V f = factory.newNumberVector(new double[dimensionality]);
- if(f instanceof ByteBufferSerializer) {
- // TODO: Remove, once we have serializers for all types
- @SuppressWarnings("unchecked")
- final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f;
- return new VectorFieldTypeInformation<V>(cls, ser, dimensionality, colnames, f);
- }
- return new VectorFieldTypeInformation<V>(cls, dimensionality, colnames, f);
+ return new VectorFieldTypeInformation<V>(factory, dimensionality, colnames);
}
// Variable dimensionality - return non-vector field type
- if(dimensionality == DIMENSIONALITY_VARIABLE) {
- V f = factory.newNumberVector(new double[0]);
- if(f instanceof ByteBufferSerializer) {
- // TODO: Remove, once we have serializers for all types
- @SuppressWarnings("unchecked")
- final ByteBufferSerializer<V> ser = (ByteBufferSerializer<V>) f;
- return new SimpleTypeInformation<V>(cls, ser);
- }
- return new SimpleTypeInformation<V>(cls);
+ if (dimensionality == DIMENSIONALITY_VARIABLE) {
+ return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
@@ -384,16 +371,16 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParser.Parameterizer {
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParser.Parameterizer {
/**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices = null;
/**
- * Factory
+ * Factory object.
*/
- protected V factory;
+ protected NumberVector.Factory<V, ?> factory;
@Override
protected void makeOptions(Parameterization config) {
@@ -402,21 +389,31 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
getFactory(config);
}
+ /**
+ * Get the object factory.
+ *
+ * @param config Parameterization
+ */
protected void getFactory(Parameterization config) {
- ObjectParameter<V> factoryP = new ObjectParameter<V>(VECTOR_TYPE_ID, NumberVector.class, DoubleVector.class);
- if(config.grab(factoryP)) {
+ ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<NumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
+ if (config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
}
+ /**
+ * Get the label indices.
+ *
+ * @param config Parameterization
+ */
protected void getLabelIndices(Parameterization config) {
IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true);
- labelIndices = new BitSet();
- if(config.grab(labelIndicesP)) {
+ if (config.grab(labelIndicesP)) {
+ labelIndices = new BitSet();
List<Integer> labelcols = labelIndicesP.getValue();
- for(Integer idx : labelcols) {
- labelIndices.set(idx);
+ for (Integer idx : labelcols) {
+ labelIndices.set(idx.intValue());
}
}
}
@@ -426,4 +423,4 @@ public class NumberVectorLabelParser<V extends NumberVector<V, ?>> extends Abstr
return new NumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
index 1f078d19..5b511a92 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java
@@ -36,7 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
*
* @apiviz.landmark
* @apiviz.uses InputStream
- * @apiviz.uses MultipleObjectsBundle oneway - - «create»
+ * @apiviz.has MultipleObjectsBundle oneway - - «create»
*/
public interface Parser extends Parameterizable, InspectionUtilFrequentlyScanned {
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
index baa6f0ec..ce366b9e 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java
@@ -41,7 +41,10 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.StringLengthConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter;
/**
* Parser to load polygon data (2D and 3D only) from a simple format. One record
@@ -59,7 +62,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(SimplePolygonParser.class);
+ private static final Logging LOG = Logging.getLogger(SimplePolygonParser.class);
/**
* Pattern to catch coordinates
@@ -132,7 +135,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
ExternalID eid = null;
LabelList labels = null;
- List<Polygon> polys = new java.util.Vector<Polygon>(1);
+ List<Polygon> polys = new ArrayList<Polygon>(1);
List<Vector> coords = new ArrayList<Vector>();
while(iter.hasNext()) {
@@ -152,7 +155,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
continue;
}
catch(NumberFormatException e) {
- logger.warning("Looked like a coordinate pair but didn't parse: " + cur);
+ LOG.warning("Looked like a coordinate pair but didn't parse: " + cur);
}
}
// Polygon separator.
@@ -184,7 +187,7 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
@@ -197,7 +200,15 @@ public class SimplePolygonParser extends AbstractParser implements Parser {
public static class Parameterizer extends AbstractParser.Parameterizer {
@Override
protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
+ PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, "\\s+");
+ if(config.grab(colParam)) {
+ colSep = colParam.getValue();
+ }
+ StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR));
+ quoteParam.addConstraint(new StringLengthConstraint(1, 1));
+ if(config.grab(quoteParam)) {
+ quoteChar = quoteParam.getValue().charAt(0);
+ }
}
@Override
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
index 70188d38..35e53bb7 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java
@@ -40,7 +40,6 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
/**
* Provides a parser for parsing one sparse BitVector per line, where the
@@ -60,7 +59,7 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
/**
* Class logger
*/
- private static final Logging logger = Logging.getLogger(SparseBitVectorLabelParser.class);
+ private static final Logging LOG = Logging.getLogger(SparseBitVectorLabelParser.class);
/**
* Constructor.
@@ -90,7 +89,7 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
for(String entry : entries) {
try {
- Integer index = Integer.valueOf(entry);
+ int index = Integer.parseInt(entry);
bitSet.set(index);
dimensionality = Math.max(dimensionality, index);
}
@@ -122,12 +121,12 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
}
protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) {
- return new VectorFieldTypeInformation<BitVector>(BitVector.class, dimensionality, new BitVector(new BitSet(), dimensionality));
+ return new VectorFieldTypeInformation<BitVector>(BitVector.FACTORY, dimensionality);
}
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
@@ -139,11 +138,6 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser
*/
public static class Parameterizer extends AbstractParser.Parameterizer {
@Override
- protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
- }
-
- @Override
protected SparseBitVectorLabelParser makeInstance() {
return new SparseBitVectorLabelParser(colSep, quoteChar);
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
index d0ab7a85..9f658b0a 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java
@@ -78,7 +78,7 @@ public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser<
* @param labelIndices Label indexes
*/
public SparseFloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, SparseFloatVector.STATIC);
+ super(colSep, quoteChar, labelIndices, SparseFloatVector.FACTORY);
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
index 917dd2aa..f4ec8c59 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java
@@ -30,7 +30,6 @@ import java.util.List;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
-import de.lmu.ifi.dbs.elki.data.SparseDoubleVector;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
@@ -74,15 +73,17 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Arthur Zimek
*
* @apiviz.has SparseNumberVector
+ *
+ * @param <V> vector type
*/
// FIXME: Maxdim!
@Title("Sparse Vector Label Parser")
@Description("Parser for the following line format:\n" + "A single line provides a single point. Entries are separated by whitespace. " + "The values will be parsed as floats (resulting in a set of SparseFloatVectors). A line is expected in the following format: The first entry of each line is the number of attributes with coordinate value not zero. Subsequent entries are of the form (index, value), where index is the number of the corresponding dimension, and value is the value of the corresponding attribute." + "Any pair of two subsequent substrings not containing whitespace is tried to be read as int and float. If this fails for the first of the pair (interpreted ans index), it will be appended to a label. (Thus, any label must not be parseable as Integer.) If the float component is not parseable, an exception will be thrown. Empty lines and lines beginning with \"#\" will be ignored.")
-public class SparseNumberVectorLabelParser<V extends SparseNumberVector<V, ?>> extends NumberVectorLabelParser<V> {
+public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> extends NumberVectorLabelParser<V> {
/**
- * Class logger
+ * Class logger.
*/
- private static final Logging logger = Logging.getLogger(SparseNumberVectorLabelParser.class);
+ private static final Logging LOG = Logging.getLogger(SparseNumberVectorLabelParser.class);
/**
* Holds the dimensionality of the parsed data which is the maximum occurring
@@ -91,6 +92,11 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<V, ?>> e
private int maxdim = -1;
/**
+ * Same as {@link #factory}, but subtype.
+ */
+ private SparseNumberVector.Factory<V, ?> sparsefactory;
+
+ /**
* Constructor.
*
* @param colSep Column separator
@@ -98,8 +104,9 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<V, ?>> e
* @param labelIndices Label indexes
* @param factory Vector factory
*/
- public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, V factory) {
+ public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
super(colSep, quoteChar, labelIndices, factory);
+ this.sparsefactory = factory;
}
@Override
@@ -110,55 +117,51 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<V, ?>> e
TIntDoubleHashMap values = new TIntDoubleHashMap(cardinality, 1);
LabelList labels = null;
- for(int i = 1; i < entries.size() - 1; i++) {
- if(!labelIndices.get(i)) {
+ for (int i = 1; i < entries.size() - 1; i++) {
+ if (labelIndices == null || !labelIndices.get(i)) {
try {
- int index = Integer.valueOf(entries.get(i));
- if(index >= maxdim) {
+ int index = Integer.parseInt(entries.get(i));
+ if (index >= maxdim) {
maxdim = index + 1;
}
- double attribute = Double.valueOf(entries.get(i));
+ double attribute = Double.parseDouble(entries.get(i));
values.put(index, attribute);
i++;
- }
- catch(NumberFormatException e) {
- if(labels == null) {
+ } catch (NumberFormatException e) {
+ if (labels == null) {
labels = new LabelList(1);
}
labels.add(entries.get(i));
continue;
}
- }
- else {
- if(labels == null) {
+ } else {
+ if (labels == null) {
labels = new LabelList(1);
}
labels.add(entries.get(i));
}
}
- if(values.size() > maxdim) {
+ if (values.size() > maxdim) {
throw new AbortException("Invalid sparse vector seen: " + line);
}
- curvec = factory.newNumberVector(values, maxdim);
+ curvec = sparsefactory.newNumberVector(values, maxdim);
curlbl = labels;
}
@Override
protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
- @SuppressWarnings("unchecked")
- Class<V> cls = (Class<V>) factory.getClass();
- if(dimensionality > 0) {
- return new VectorFieldTypeInformation<V>(cls, dimensionality, factory.newNumberVector(SparseDoubleVector.EMPTYMAP, dimensionality));
+ if (dimensionality > 0) {
+ return new VectorFieldTypeInformation<V>(factory, dimensionality);
}
- if(dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<V>(cls);
+ if (dimensionality == DIMENSIONALITY_VARIABLE) {
+ return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
@@ -168,18 +171,18 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<V, ?>> e
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends SparseNumberVector<V, ?>> extends NumberVectorLabelParser.Parameterizer<V> {
+ public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<V> factoryP = new ObjectParameter<V>(VECTOR_TYPE_ID, SparseNumberVector.class, SparseFloatVector.class);
- if(config.grab(factoryP)) {
+ ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<SparseNumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ if (config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
}
@Override
protected SparseNumberVectorLabelParser<V> makeInstance() {
- return new SparseNumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory);
+ return new SparseNumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
index bb277b17..2ea6ebb5 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java
@@ -23,8 +23,8 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import gnu.trove.iterator.TIntFloatIterator;
-import gnu.trove.map.hash.TIntFloatHashMap;
+import gnu.trove.iterator.TIntDoubleIterator;
+import gnu.trove.map.hash.TIntDoubleHashMap;
import java.util.BitSet;
import java.util.HashMap;
@@ -33,6 +33,7 @@ import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
+import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -42,6 +43,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* A parser to load term frequency data, which essentially are sparse vectors
@@ -53,28 +55,33 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
*/
@Title("Term frequency parser")
@Description("Parse a file containing term frequencies. The expected format is 'label term1 <freq> term2 <freq> ...'. Terms must not contain the separator character!")
-public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVector> {
+public class TermFrequencyParser<V extends SparseNumberVector<?>> extends NumberVectorLabelParser<V> {
/**
- * Class logger
+ * Class logger.
*/
- private static final Logging logger = Logging.getLogger(TermFrequencyParser.class);
+ private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class);
/**
- * Maximum dimension used
+ * Maximum dimension used.
*/
int maxdim;
/**
- * Map
+ * Map.
*/
HashMap<String, Integer> keymap;
/**
- * Normalize
+ * Normalize.
*/
boolean normalize;
/**
+ * Same as {@link #factory}, but subtype.
+ */
+ private SparseNumberVector.Factory<V, ?> sparsefactory;
+
+ /**
* Constructor.
*
* @param normalize Normalize
@@ -82,11 +89,12 @@ public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVect
* @param quoteChar
* @param labelIndices
*/
- public TermFrequencyParser(boolean normalize, Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, SparseFloatVector.STATIC);
+ public TermFrequencyParser(boolean normalize, Pattern colSep, char quoteChar, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ super(colSep, quoteChar, labelIndices, factory);
this.normalize = normalize;
this.maxdim = 0;
this.keymap = new HashMap<String, Integer>();
+ this.sparsefactory = factory;
}
@Override
@@ -94,30 +102,28 @@ public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVect
List<String> entries = tokenize(line);
double len = 0;
- TIntFloatHashMap values = new TIntFloatHashMap();
+ TIntDoubleHashMap values = new TIntDoubleHashMap();
LabelList labels = null;
String curterm = null;
- for(int i = 0; i < entries.size(); i++) {
- if(curterm == null) {
+ for (int i = 0; i < entries.size(); i++) {
+ if (curterm == null) {
curterm = entries.get(i);
- }
- else {
+ } else {
try {
- float attribute = Float.valueOf(entries.get(i));
+ double attribute = Double.parseDouble(entries.get(i));
Integer curdim = keymap.get(curterm);
- if(curdim == null) {
- curdim = maxdim + 1;
+ if (curdim == null) {
+ curdim = Integer.valueOf(maxdim + 1);
keymap.put(curterm, curdim);
maxdim += 1;
}
values.put(curdim, attribute);
len += attribute;
curterm = null;
- }
- catch(NumberFormatException e) {
- if(curterm != null) {
- if(labels == null) {
+ } catch (NumberFormatException e) {
+ if (curterm != null) {
+ if (labels == null) {
labels = new LabelList(1);
}
labels.add(curterm);
@@ -126,39 +132,39 @@ public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVect
}
}
}
- if(curterm != null) {
- if(labels == null) {
+ if (curterm != null) {
+ if (labels == null) {
labels = new LabelList(1);
}
labels.add(curterm);
}
- if(normalize) {
- if(Math.abs(len - 1.0) > 1E-10 && len > 1E-10) {
- for(TIntFloatIterator iter = values.iterator(); iter.hasNext();) {
+ if (normalize) {
+ if (Math.abs(len - 1.0) > 1E-10 && len > 1E-10) {
+ for (TIntDoubleIterator iter = values.iterator(); iter.hasNext();) {
iter.advance();
- iter.setValue((float) (iter.value() / len));
+ iter.setValue(iter.value() / len);
}
}
}
- curvec = new SparseFloatVector(values, maxdim);
+ curvec = sparsefactory.newNumberVector(values, maxdim);
curlbl = labels;
}
@Override
- protected SimpleTypeInformation<SparseFloatVector> getTypeInformation(int dimensionality) {
- if(dimensionality > 0) {
- return new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.class, dimensionality, new SparseFloatVector(SparseFloatVector.EMPTYMAP, dimensionality));
+ protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
+ if (dimensionality > 0) {
+ return new VectorFieldTypeInformation<V>(factory, dimensionality);
}
- if(dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<SparseFloatVector>(SparseFloatVector.class);
+ if (dimensionality == DIMENSIONALITY_VARIABLE) {
+ return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
@@ -168,14 +174,14 @@ public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVect
*
* @apiviz.exclude
*/
- public static class Parameterizer extends NumberVectorLabelParser.Parameterizer<SparseFloatVector> {
+ public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
/**
- * Option ID for normalization
+ * Option ID for normalization.
*/
- public static final OptionID NORMALIZE_FLAG = OptionID.getOrCreateOptionID("tf.normalize", "Normalize vectors to manhattan length 1 (convert term counts to term frequencies)");
+ public static final OptionID NORMALIZE_FLAG = new OptionID("tf.normalize", "Normalize vectors to manhattan length 1 (convert term counts to term frequencies)");
/**
- * Normalization flag
+ * Normalization flag.
*/
boolean normalize = false;
@@ -183,14 +189,22 @@ public class TermFrequencyParser extends NumberVectorLabelParser<SparseFloatVect
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
Flag normF = new Flag(NORMALIZE_FLAG);
- if(config.grab(normF)) {
- normalize = normF.getValue();
+ if (config.grab(normF)) {
+ normalize = normF.getValue().booleanValue();
+ }
+ }
+
+ @Override
+ protected void getFactory(Parameterization config) {
+ ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<SparseNumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ if (config.grab(factoryP)) {
+ factory = factoryP.instantiateClass(config);
}
}
@Override
- protected TermFrequencyParser makeInstance() {
- return new TermFrequencyParser(normalize, colSep, quoteChar, labelIndices);
+ protected TermFrequencyParser<V> makeInstance() {
+ return new TermFrequencyParser<V>(normalize, colSep, quoteChar, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
}
}
-} \ No newline at end of file
+}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
index f1999262..58ae9a77 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java
@@ -34,6 +34,9 @@
* <p>As an example file following these requirements consider e.g.:
* <a href="http://www.dbs.ifi.lmu.de/research/KDD/ELKI/datasets/example/exampledata.txt">exampledata.txt</a>
* </p>
+ *
+ * @apiviz.exclude java.io.*
+ * @apiviz.exclude de.lmu.ifi.dbs.elki.utilities.*
*/
/*
This file is part of ELKI:
@@ -57,4 +60,4 @@ GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-package de.lmu.ifi.dbs.elki.datasource.parser; \ No newline at end of file
+package de.lmu.ifi.dbs.elki.datasource.parser;