summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
diff options
context:
space:
mode:
authorAndrej Shadura <andrewsh@debian.org>2019-03-09 22:30:38 +0000
committerAndrej Shadura <andrewsh@debian.org>2019-03-09 22:30:38 +0000
commit14a486343aef55f97f54082d6b542dedebf6f3ba (patch)
tree000fcc4968578771ad265079eef7617d66de2cda /src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
parent8300861dc4c62c5567a4e654976072f854217544 (diff)
Import Upstream version 0.6.0
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java167
1 files changed, 96 insertions, 71 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
index 39da752b..3fe4af09 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java
@@ -29,8 +29,9 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.util.ArrayList;
import java.util.BitSet;
-import java.util.Iterator;
+import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
@@ -79,25 +80,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
private static final Logging LOG = Logging.getLogger(NumberVectorLabelParser.class);
/**
- * A comma separated list of the indices of labels (may be numeric), counting
- * whitespace separated entries in a line starting with 0. The corresponding
- * entries will be treated as a label.
- * <p>
- * Key: {@code -parser.labelIndices}
- * </p>
- */
- public static final OptionID LABEL_INDICES_ID = new OptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");
-
- /**
- * Parameter to specify the type of vectors to produce.
- * <p>
- * Key: {@code -parser.vector-type}<br />
- * Default: DoubleVector
- * </p>
- */
- public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");
-
- /**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices;
@@ -138,6 +120,11 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
protected BitSet labelcolumns = null;
/**
+ * Whether or not the data set has labels.
+ */
+ protected boolean haslabels = false;
+
+ /**
* Current vector.
*/
protected V curvec = null;
@@ -148,6 +135,21 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
protected LabelList curlbl = null;
/**
+ * (Reused) store for numerical attributes.
+ */
+ final TDoubleArrayList attributes = new TDoubleArrayList();
+
+ /**
+ * (Reused) store for labels.
+ */
+ final ArrayList<String> labels = new ArrayList<>();
+
+ /**
+ * For String unification.
+ */
+ HashMap<String, String> unique = new HashMap<>();
+
+ /**
* Event to report next.
*/
Event nextevent = null;
@@ -158,20 +160,20 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param factory Vector factory
*/
public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) {
- this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, Pattern.compile(COMMENT_PATTERN), null, factory);
+ this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHARS, Pattern.compile(COMMENT_PATTERN), null, factory);
}
/**
* Constructor.
*
* @param colSep Column separator
- * @param quoteChar Quote character
+ * @param quoteChars Quote characters
* @param comment Comment pattern
* @param labelIndices Column indexes to be treated as labels (may be numeric).
* @param factory Vector factory
*/
- public NumberVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
- super(colSep, quoteChar, comment);
+ public NumberVectorLabelParser(Pattern colSep, String quoteChars, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) {
+ super(colSep, quoteChars, comment);
this.labelIndices = labelIndices;
this.factory = factory;
}
@@ -183,8 +185,9 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
mindim = Integer.MAX_VALUE;
maxdim = 0;
columnnames = null;
+ haslabels = false;
labelcolumns = new BitSet();
- if (labelIndices != null) {
+ if(labelIndices != null) {
labelcolumns.or(labelIndices);
}
}
@@ -196,36 +199,31 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
@Override
public Event nextEvent() {
- if (nextevent != null) {
+ if(nextevent != null) {
Event ret = nextevent;
nextevent = null;
return ret;
}
try {
- for (String line; (line = reader.readLine()) != null; lineNumber++) {
+ for(String line; (line = reader.readLine()) != null; lineNumber++) {
// Skip empty lines and comments
- if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
+ if(line.length() <= 0 || (comment != null && comment.matcher(line).matches())) {
continue;
}
parseLineInternal(line);
// Maybe a header column?
- if (curvec == null) {
+ if(curvec == null) {
continue;
}
final int curdim = curvec.getDimensionality();
- if (maxdim < mindim) {
- mindim = curdim;
- maxdim = curdim;
- buildMeta();
- nextevent = Event.NEXT_OBJECT;
- return Event.META_CHANGED;
- } else if (mindim < curdim || maxdim > curdim) {
+ if(curdim > maxdim || mindim > curdim) {
mindim = Math.min(mindim, curdim);
maxdim = Math.max(maxdim, curdim);
buildMeta();
nextevent = Event.NEXT_OBJECT;
return Event.META_CHANGED;
- } else if (curlbl != null && meta != null && meta.size() == 1) {
+ }
+ else if(curlbl != null && meta != null && meta.size() == 1) {
buildMeta();
nextevent = Event.NEXT_OBJECT;
return Event.META_CHANGED;
@@ -234,8 +232,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
}
reader.close();
reader = null;
+ unique.clear();
return Event.END_OF_STREAM;
- } catch (IOException e) {
+ }
+ catch(IOException e) {
throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".");
}
}
@@ -244,11 +244,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* Update the meta element.
*/
protected void buildMeta() {
- if (labelcolumns.cardinality() > 0 || (labelIndices != null && labelIndices.cardinality() > 0)) {
+ if(haslabels) {
meta = new BundleMeta(2);
meta.add(getTypeInformation(mindim, maxdim));
meta.add(TypeUtil.LABELLIST);
- } else {
+ }
+ else {
meta = new BundleMeta(1);
meta.add(getTypeInformation(mindim, maxdim));
}
@@ -256,10 +257,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
@Override
public Object data(int rnum) {
- if (rnum == 0) {
+ if(rnum == 0) {
return curvec;
}
- if (rnum == 1) {
+ if(rnum == 1) {
return curlbl;
}
throw new ArrayIndexOutOfBoundsException();
@@ -273,45 +274,48 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @param line Line to process
*/
protected void parseLineInternal(String line) {
- List<String> entries = tokenize(line);
- // Split into numerical attributes and labels
- TDoubleArrayList attributes = new TDoubleArrayList(entries.size());
- LabelList labels = null;
+ attributes.reset();
+ labels.clear();
- Iterator<String> itr = entries.iterator();
- for (int i = 0; itr.hasNext(); i++) {
- String ent = itr.next();
- if (labelIndices == null || !labelIndices.get(i)) {
+ // Split into numerical attributes and labels
+ int i = 0;
+ for(tokenizer.initialize(line, 0, lengthWithoutLinefeed(line)); tokenizer.valid(); tokenizer.advance(), i++) {
+ if(labelIndices == null || !labelIndices.get(i)) {
try {
- double attribute = parseDouble(ent);
+ double attribute = tokenizer.getDouble();
attributes.add(attribute);
continue;
- } catch (NumberFormatException e) {
+ }
+ catch(NumberFormatException e) {
// Ignore attempt, add to labels below.
labelcolumns.set(i);
}
}
// Else: labels.
- if (labels == null) {
- labels = new LabelList(1);
+ haslabels = true;
+ final String lbl = tokenizer.getSubstring();
+ String u = unique.get(lbl);
+ if(u == null) {
+ u = lbl;
+ unique.put(u, u);
}
- // Make a new string, to not keep the whole file in memory!
- labels.add(new String(ent));
+ labels.add(u);
}
// Maybe a label row?
- if (lineNumber == 1 && attributes.size() == 0) {
- columnnames = labels;
+ if(lineNumber == 1 && attributes.size() == 0) {
+ columnnames = new ArrayList<>(labels);
labelcolumns.clear();
- if (labelIndices != null) {
+ if(labelIndices != null) {
labelcolumns.or(labelIndices);
}
curvec = null;
curlbl = null;
+ haslabels = false;
return;
}
// Pass outside via class variables
curvec = createDBObject(attributes, ArrayLikeUtil.TDOUBLELISTADAPTER);
- curlbl = labels;
+ curlbl = LabelList.make(labels);
}
/**
@@ -334,13 +338,13 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
* @return Prototype object
*/
SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
- if (mindim == maxdim) {
+ if(mindim == maxdim) {
String[] colnames = null;
- if (columnnames != null) {
- if (columnnames.size() - labelcolumns.cardinality() == mindim) {
+ if(columnnames != null) {
+ if(columnnames.size() - labelcolumns.cardinality() == mindim) {
colnames = new String[mindim];
- for (int i = 0, j = 0; i < columnnames.size(); i++) {
- if (!labelcolumns.get(i)) {
+ for(int i = 0, j = 0; i < columnnames.size(); i++) {
+ if(!labelcolumns.get(i)) {
colnames[j] = columnnames.get(i);
j++;
}
@@ -348,10 +352,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
}
}
return new VectorFieldTypeInformation<>(factory, mindim, colnames);
- } else if (mindim < maxdim) {
+ }
+ else if(mindim < maxdim) {
// Variable dimensionality - return non-vector field type
return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
- } else {
+ }
+ else {
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
}
@@ -370,6 +376,25 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*/
public static class Parameterizer<V extends NumberVector<?>> extends AbstractParser.Parameterizer {
/**
+ * A comma separated list of the indices of labels (may be numeric),
+ * counting whitespace separated entries in a line starting with 0. The
+ * corresponding entries will be treated as a label.
+ * <p>
+ * Key: {@code -parser.labelIndices}
+ * </p>
+ */
+ public static final OptionID LABEL_INDICES_ID = new OptionID("parser.labelIndices", "A comma separated list of the indices of labels (may be numeric), counting whitespace separated entries in a line starting with 0. The corresponding entries will be treated as a label.");
+
+ /**
+ * Parameter to specify the type of vectors to produce.
+ * <p>
+ * Key: {@code -parser.vector-type}<br />
+ * Default: DoubleVector
+ * </p>
+ */
+ public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes.");
+
+ /**
* Keeps the indices of the attributes to be treated as a string label.
*/
protected BitSet labelIndices = null;
@@ -393,7 +418,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
*/
protected void getFactory(Parameterization config) {
ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class);
- if (config.grab(factoryP)) {
+ if(config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
}
@@ -406,10 +431,10 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
protected void getLabelIndices(Parameterization config) {
IntListParameter labelIndicesP = new IntListParameter(LABEL_INDICES_ID, true);
- if (config.grab(labelIndicesP)) {
+ if(config.grab(labelIndicesP)) {
labelIndices = new BitSet();
List<Integer> labelcols = labelIndicesP.getValue();
- for (Integer idx : labelcols) {
+ for(Integer idx : labelcols) {
labelIndices.set(idx.intValue());
}
}
@@ -417,7 +442,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract
@Override
protected NumberVectorLabelParser<V> makeInstance() {
- return new NumberVectorLabelParser<>(colSep, quoteChar, comment, labelIndices, factory);
+ return new NumberVectorLabelParser<>(colSep, quoteChars, comment, labelIndices, factory);
}
}
}