diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser')
16 files changed, 530 insertions, 315 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java index 1f414055..1e689638 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -54,7 +54,8 @@ public abstract class AbstractParser { public static final char QUOTE_CHAR = '\"'; /** - * A pattern catching most numbers that can be parsed using Double.parseDouble: + * A pattern catching most numbers that can be parsed using + * Double.parseDouble: * * Some examples: <code>1</code> <code>1.</code> <code>1.2</code> * <code>.2</code> <code>-.2e-03</code> @@ -62,16 +63,14 @@ public abstract class AbstractParser { public static final String NUMBER_PATTERN = "[+-]?(?:\\d+\\.?|\\d*\\.\\d+)?(?:[eE][-]?\\d+)?"; /** - * OptionID for the column separator parameter (defaults to whitespace as in - * {@link #DEFAULT_SEPARATOR}. + * Default pattern for comments. */ - public static final OptionID COLUMN_SEPARATOR_ID = new OptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data."); + public static final String COMMENT_PATTERN = "^\\s*(#|//|;).*$"; /** - * OptionID for the quote character parameter (defaults to a double quotation - * mark as in {@link #QUOTE_CHAR}. + * A sign to separate attributes. */ - public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote."); + public static final String ATTRIBUTE_CONCATENATION = " "; /** * Stores the column separator pattern @@ -84,25 +83,22 @@ public abstract class AbstractParser { protected char quoteChar = QUOTE_CHAR; /** - * The comment character. + * Comment pattern. */ - public static final String COMMENT = "#"; - - /** - * A sign to separate attributes. - */ - public static final String ATTRIBUTE_CONCATENATION = " "; + protected Pattern comment = null; /** * Constructor. * * @param colSep Column separator * @param quoteChar Quote character + * @param comment Comment pattern */ - public AbstractParser(Pattern colSep, char quoteChar) { + public AbstractParser(Pattern colSep, char quoteChar, Pattern comment) { super(); this.colSep = colSep; this.quoteChar = quoteChar; + this.comment = comment; } /** @@ -113,16 +109,16 @@ public abstract class AbstractParser { * @return Tokenized string */ protected List<String> tokenize(String input) { - ArrayList<String> matchList = new ArrayList<String>(); + ArrayList<String> matchList = new ArrayList<>(); Matcher m = colSep.matcher(input); int index = 0; boolean inquote = (input.length() > 0) && (input.charAt(0) == quoteChar); - while(m.find()) { + while (m.find()) { // Quoted code path vs. regular code path - if(inquote && m.start() > 0) { + if (inquote && m.start() > 0) { // Closing quote found? - if(m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) { + if (m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) { // Strip quote characters if (index + 1 < m.start() - 1) { matchList.add(input.substring(index + 1, m.start() - 1)); @@ -132,8 +128,7 @@ public abstract class AbstractParser { // new quote? inquote = (index < input.length()) && (input.charAt(index) == quoteChar); } - } - else { + } else { // Add match before separator if (index < m.start()) { matchList.add(input.substring(index, m.start())); @@ -145,25 +140,23 @@ public abstract class AbstractParser { } } // Nothing found - return original string. - if(index == 0) { + if (index == 0) { matchList.add(input); return matchList; } // Add tail after last separator. - if(inquote) { - if(input.charAt(input.length() - 1) == quoteChar) { + if (inquote) { + if (input.charAt(input.length() - 1) == quoteChar) { if (index + 1 < input.length() - 1) { matchList.add(input.substring(index + 1, input.length() - 1)); } - } - else { - getLogger().warning("Invalid quoted line in input."); + } else { + getLogger().warning("Invalid quoted line in input: no closing quote found in: " + input); if (index < input.length()) { matchList.add(input.substring(index, input.length())); } } - } - else { + } else { if (index < input.length()) { matchList.add(input.substring(index, input.length())); } @@ -190,6 +183,43 @@ public abstract class AbstractParser { } /** + * Utility function, which is a bit more robust wrt. parsing double values. In + * particular: infinite values, and creates fewer objects. + * + * @param s String s + * @return parsed value + * @throws NumberFormatException + */ + public static double parseDouble(String s) throws NumberFormatException { + try { + return Double.parseDouble(s); + } catch (NumberFormatException e) { + int len = s.length(); + if (len > 0) { + int p = 0; + char cur = s.charAt(p); + boolean isNegative = cur == '-'; + if (isNegative && ++p < len) { + cur = s.charAt(p); + } + if (cur == '∞') { + return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + } + if (len - p == 3 && "Inf".regionMatches(true, 0, s, p, 3)) { + return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + } + if (len - p == 8 && "Infinity".regionMatches(true, 0, s, p, 8)) { + return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + } + if (len == 3 && "NaN".equalsIgnoreCase(s)) { + return Double.NaN; + } + } + throw e; + } + } + + /** * Parameterization class. * * @author Erich Schubert @@ -198,6 +228,23 @@ public abstract class AbstractParser { */ public abstract static class Parameterizer extends AbstractParameterizer { /** + * OptionID for the column separator parameter (defaults to whitespace as in + * {@link #DEFAULT_SEPARATOR}. + */ + public static final OptionID COLUMN_SEPARATOR_ID = new OptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data."); + + /** + * OptionID for the quote character parameter (defaults to a double + * quotation mark as in {@link #QUOTE_CHAR}. + */ + public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote."); + + /** + * Comment pattern. + */ + public static final OptionID COMMENT_ID = new OptionID("string.comment", "Ignore lines in the input file that satisfy this pattern."); + + /** * Stores the column separator pattern */ protected Pattern colSep = null; @@ -207,21 +254,30 @@ public abstract class AbstractParser { */ protected char quoteChar = QUOTE_CHAR; + /** + * Comment pattern. + */ + protected Pattern comment = null; + @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, DEFAULT_SEPARATOR); - if(config.grab(colParam)) { + if (config.grab(colParam)) { colSep = colParam.getValue(); } StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR)); quoteParam.addConstraint(new StringLengthConstraint(1, 1)); - if(config.grab(quoteParam)) { + if (config.grab(quoteParam)) { quoteChar = quoteParam.getValue().charAt(0); } + PatternParameter commentP = new PatternParameter(COMMENT_ID, COMMENT_PATTERN); + if (config.grab(commentP)) { + comment = commentP.getValue(); + } } @Override protected abstract AbstractParser makeInstance(); } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java index 79f17326..53b4b6e8 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractStreamingParser.java @@ -1,15 +1,10 @@ package de.lmu.ifi.dbs.elki.datasource.parser; -import java.io.InputStream; -import java.util.regex.Pattern; - -import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,6 +22,12 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +import java.io.InputStream; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; + /** * Base class for streaming parsers. * @@ -38,9 +39,10 @@ public abstract class AbstractStreamingParser extends AbstractParser implements * * @param colSep Column separator pattern * @param quoteChar Quote character + * @param comment Comment pattern */ - public AbstractStreamingParser(Pattern colSep, char quoteChar) { - super(colSep, quoteChar); + public AbstractStreamingParser(Pattern colSep, char quoteChar, Pattern comment) { + super(colSep, quoteChar, comment); } @Override @@ -48,4 +50,4 @@ public abstract class AbstractStreamingParser extends AbstractParser implements this.initStream(in); return MultipleObjectsBundle.fromStream(this); } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java index d1280fbe..718963d1 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,7 +24,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; */ import gnu.trove.iterator.TIntObjectIterator; -import gnu.trove.map.hash.TIntFloatHashMap; +import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import java.io.BufferedReader; @@ -41,7 +41,7 @@ import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.ExternalID; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.SimpleClassLabel; -import de.lmu.ifi.dbs.elki.data.SparseFloatVector; +import de.lmu.ifi.dbs.elki.data.SparseDoubleVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -60,7 +60,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; * This parser is quite hackish, and contains lots of not yet configurable * magic. * - * TODO: Sparse vectors are not yet fully supported. + * TODO: Allow configuration of the vector types (double, float) + * + * TODO: when encountering integer columns, produce integer vectors. + * + * TODO: allow optional class labels. * * @author Erich Schubert */ @@ -146,8 +150,8 @@ public class ArffParser implements Parser { public MultipleObjectsBundle parse(InputStream instream) { try { BufferedReader br = new BufferedReader(new InputStreamReader(instream)); - ArrayList<String> names = new ArrayList<String>(); - ArrayList<String> types = new ArrayList<String>(); + ArrayList<String> names = new ArrayList<>(); + ArrayList<String> types = new ArrayList<>(); readHeader(br); parseAttributeStatements(br, names, types); @@ -205,7 +209,7 @@ public class ArffParser implements Parser { private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException { // logger.warning("Sparse instance."); - TIntObjectHashMap<Object> map = new TIntObjectHashMap<Object>(); + TIntObjectHashMap<Object> map = new TIntObjectHashMap<>(); while(true) { nextToken(tokenizer); assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL); @@ -216,19 +220,21 @@ public class ArffParser implements Parser { } else { // sparse token - if(tokenizer.ttype != StreamTokenizer.TT_NUMBER) { - throw new AbortException("Unexpected token type encountered: " + tokenizer.toString()); + if(tokenizer.ttype != StreamTokenizer.TT_WORD) { + throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype); } - int dim = (int) tokenizer.nval; + int dim = Integer.valueOf(tokenizer.sval); if(map.containsKey(dim)) { throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString()); } nextToken(tokenizer); - if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) { - map.put(dim, Double.valueOf(tokenizer.nval)); - } - else if(tokenizer.ttype == StreamTokenizer.TT_WORD) { - map.put(dim, tokenizer.sval); + if(tokenizer.ttype == StreamTokenizer.TT_WORD) { + if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]])) { + map.put(dim, AbstractParser.parseDouble(tokenizer.sval)); + } + else { + map.put(dim, tokenizer.sval); + } } else { throw new AbortException("Unexpected token type encountered: " + tokenizer.toString()); @@ -247,7 +253,7 @@ public class ArffParser implements Parser { } assert (s >= 0); if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) { - TIntFloatHashMap f = new TIntFloatHashMap(dimsize[out]); + TIntDoubleHashMap f = new TIntDoubleHashMap(dimsize[out]); for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) { iter.advance(); int i = iter.key(); @@ -258,9 +264,9 @@ public class ArffParser implements Parser { break; } double v = ((Double) iter.value()).doubleValue(); - f.put(i - s + 1, (float) v); + f.put(i - s, v); } - data[out] = new SparseFloatVector(f, dimsize[out]); + data[out] = new SparseDoubleVector(f, dimsize[out]); } else if(TypeUtil.LABELLIST.equals(elkitypes[out])) { // Build a label list out of successive labels @@ -292,10 +298,10 @@ public class ArffParser implements Parser { } } else if(TypeUtil.CLASSLABEL.equals(elkitypes[out])) { - String val = (String) map.get(s); + Object val = map.get(s); if(val != null) { // TODO: support other class label types. - ClassLabel lbl = new SimpleClassLabel(val); + ClassLabel lbl = new SimpleClassLabel(String.valueOf(val)); data[out] = lbl; } else { @@ -321,7 +327,7 @@ public class ArffParser implements Parser { } else if(tokenizer.ttype == StreamTokenizer.TT_WORD) { try { - cur[k] = Double.parseDouble(tokenizer.sval); + cur[k] = AbstractParser.parseDouble(tokenizer.sval); } catch(NumberFormatException e) { throw new AbortException("Expected number value, got: " + tokenizer.sval); @@ -381,7 +387,7 @@ public class ArffParser implements Parser { { tokenizer.resetSyntax(); tokenizer.whitespaceChars(0, ' '); - tokenizer.ordinaryChars('0', '9'); + tokenizer.ordinaryChars('0', '9'); // Do not parse numbers tokenizer.ordinaryChar('-'); tokenizer.ordinaryChar('.'); tokenizer.wordChars(' ' + 1, '\u00FF'); @@ -421,12 +427,12 @@ public class ArffParser implements Parser { labels[i] = names.get(out + i); } if(!sparse) { - VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.FACTORY, dimsize[out], labels); + VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels); bundle.appendColumn(type, new ArrayList<DoubleVector>()); } else { - VectorFieldTypeInformation<SparseFloatVector> type = new VectorFieldTypeInformation<SparseFloatVector>(SparseFloatVector.FACTORY, dimsize[out], labels); - bundle.appendColumn(type, new ArrayList<SparseFloatVector>()); + VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels); + bundle.appendColumn(type, new ArrayList<SparseDoubleVector>()); } } else if(TypeUtil.LABELLIST.equals(etyp[out])) { @@ -434,13 +440,13 @@ public class ArffParser implements Parser { for(int i = 1; i < dimsize[out]; i++) { label.append(' ').append(names.get(out + i)); } - bundle.appendColumn(new SimpleTypeInformation<LabelList>(LabelList.class, label.toString()), new ArrayList<LabelList>()); + bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>()); } else if(TypeUtil.EXTERNALID.equals(etyp[out])) { - bundle.appendColumn(new SimpleTypeInformation<ExternalID>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>()); + bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(out)), new ArrayList<ExternalID>()); } else if(TypeUtil.CLASSLABEL.equals(etyp[out])) { - bundle.appendColumn(new SimpleTypeInformation<ClassLabel>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>()); + bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(out)), new ArrayList<ClassLabel>()); } else { throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null")); diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java index 32a26d7d..07019040 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/BitVectorLabelParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -63,11 +63,12 @@ public class BitVectorLabelParser extends AbstractParser implements Parser { /** * Constructor. * - * @param colSep - * @param quoteChar + * @param colSep Column separator + * @param quoteChar Quotation character + * @param comment Comment pattern */ - public BitVectorLabelParser(Pattern colSep, char quoteChar) { - super(colSep, quoteChar); + public BitVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment) { + super(colSep, quoteChar, comment); } @Override @@ -75,48 +76,47 @@ public class BitVectorLabelParser extends AbstractParser implements Parser { BufferedReader reader = new BufferedReader(new InputStreamReader(in)); int lineNumber = 0; int dimensionality = -1; - List<BitVector> vectors = new ArrayList<BitVector>(); - List<LabelList> labels = new ArrayList<LabelList>(); + List<BitVector> vectors = new ArrayList<>(); + List<LabelList> labels = new ArrayList<>(); try { - for(String line; (line = reader.readLine()) != null; lineNumber++) { - if(!line.startsWith(COMMENT) && line.length() > 0) { - List<String> entries = tokenize(line); - // FIXME: use more efficient storage right away? - List<Bit> attributes = new ArrayList<Bit>(); - LabelList ll = null; - for(String entry : entries) { - try { - Bit attribute = Bit.valueOf(entry); - attributes.add(attribute); - } - catch(NumberFormatException e) { - if(ll == null) { - ll = new LabelList(1); - } - ll.add(entry); + for (String line; (line = reader.readLine()) != null; lineNumber++) { + // Skip empty lines and comments + if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { + continue; + } + List<String> entries = tokenize(line); + // FIXME: use more efficient storage right away? + List<Bit> attributes = new ArrayList<>(); + LabelList ll = null; + for (String entry : entries) { + try { + Bit attribute = Bit.valueOf(entry); + attributes.add(attribute); + } catch (NumberFormatException e) { + if (ll == null) { + ll = new LabelList(1); } + ll.add(entry); } + } - if(dimensionality < 0) { - dimensionality = attributes.size(); - } - else if(dimensionality != attributes.size()) { - throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + "."); - } - - vectors.add(new BitVector(attributes.toArray(new Bit[attributes.size()]))); - labels.add(ll); + if (dimensionality < 0) { + dimensionality = attributes.size(); + } else if (dimensionality != attributes.size()) { + throw new IllegalArgumentException("Differing dimensionality in line " + lineNumber + "."); } + + vectors.add(new BitVector(attributes.toArray(new Bit[attributes.size()]))); + labels.add(ll); } - } - catch(IOException e) { + } catch (IOException e) { throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); } return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, labels); } protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) { - return new VectorFieldTypeInformation<BitVector>(BitVector.FACTORY, dimensionality); + return new VectorFieldTypeInformation<>(BitVector.FACTORY, dimensionality); } @Override @@ -134,7 +134,7 @@ public class BitVectorLabelParser extends AbstractParser implements Parser { public static class Parameterizer extends AbstractParser.Parameterizer { @Override protected BitVectorLabelParser makeInstance() { - return new BitVectorLabelParser(colSep, quoteChar); + return new BitVectorLabelParser(colSep, quoteChar, comment); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java index 0c291fb4..b95dce74 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/DoubleVectorLabelParser.java @@ -49,7 +49,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz *
* @apiviz.has DoubleVector
*
- * @deprecated Use NumberVectorLabelParser instead, which defaults to DoubleVector.
+ * @deprecated Use NumberVectorLabelParser instead, which defaults to
+ * DoubleVector.
*/
@Deprecated
public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVector> {
@@ -61,19 +62,20 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto /**
* Constructor.
*
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
*/
- public DoubleVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, DoubleVector.FACTORY);
+ public DoubleVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices) {
+ super(colSep, quoteChar, comment, labelIndices, DoubleVector.FACTORY);
}
/**
* Constructor with default values.
*/
public DoubleVectorLabelParser() {
- this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, new BitSet());
+ this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, Pattern.compile(COMMENT_PATTERN), new BitSet());
}
@Override
@@ -96,7 +98,7 @@ public class DoubleVectorLabelParser extends NumberVectorLabelParser<DoubleVecto @Override
protected DoubleVectorLabelParser makeInstance() {
- return new DoubleVectorLabelParser(colSep, quoteChar, labelIndices);
+ return new DoubleVectorLabelParser(colSep, quoteChar, comment, labelIndices);
}
}
-}
\ No newline at end of file +}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java index 6288da8e..71b65cfc 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/FloatVectorLabelParser.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -57,19 +57,20 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz @Deprecated
public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector> {
/**
- * Class logger
+ * Class logger.
*/
private static final Logging LOG = Logging.getLogger(FloatVectorLabelParser.class);
/**
* Constructor.
*
- * @param colSep
- * @param quoteChar
- * @param labelIndices
+ * @param colSep Column separator
+ * @param quoteChar Quotation character
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
*/
- public FloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, FloatVector.FACTORY);
+ public FloatVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices) {
+ super(colSep, quoteChar, comment, labelIndices, FloatVector.FACTORY);
}
@Override
@@ -92,7 +93,7 @@ public class FloatVectorLabelParser extends NumberVectorLabelParser<FloatVector> @Override
protected FloatVectorLabelParser makeInstance() {
- return new FloatVectorLabelParser(colSep, quoteChar, labelIndices);
+ return new FloatVectorLabelParser(colSep, quoteChar, comment, labelIndices);
}
}
}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java index ea44c072..39da752b 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/NumberVectorLabelParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -40,6 +40,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation; import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; @@ -97,16 +98,6 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract public static final OptionID VECTOR_TYPE_ID = new OptionID("parser.vector-type", "The type of vectors to create for numerical attributes."); /** - * Constant used for unknown dimensionality (e.g. empty files) - */ - public static final int DIMENSIONALITY_UNKNOWN = -1; - - /** - * Constant used for records of variable dimensionality (e.g. time series) - */ - public static final int DIMENSIONALITY_VARIABLE = -2; - - /** * Keeps the indices of the attributes to be treated as a string label. */ protected BitSet labelIndices; @@ -129,7 +120,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract /** * Dimensionality reported. */ - protected int dimensionality; + protected int mindim, maxdim; /** * Metadata. @@ -167,7 +158,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @param factory Vector factory */ public NumberVectorLabelParser(NumberVector.Factory<V, ?> factory) { - this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, null, factory); + this(Pattern.compile(DEFAULT_SEPARATOR), QUOTE_CHAR, Pattern.compile(COMMENT_PATTERN), null, factory); } /** @@ -175,11 +166,12 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * * @param colSep Column separator * @param quoteChar Quote character + * @param comment Comment pattern * @param labelIndices Column indexes that are numeric. * @param factory Vector factory */ - public NumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, NumberVector.Factory<V, ?> factory) { - super(colSep, quoteChar); + public NumberVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, NumberVector.Factory<V, ?> factory) { + super(colSep, quoteChar, comment); this.labelIndices = labelIndices; this.factory = factory; } @@ -188,7 +180,8 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract public void initStream(InputStream in) { reader = new BufferedReader(new InputStreamReader(in)); lineNumber = 1; - dimensionality = DIMENSIONALITY_UNKNOWN; + mindim = Integer.MAX_VALUE; + maxdim = 0; columnnames = null; labelcolumns = new BitSet(); if (labelIndices != null) { @@ -210,31 +203,34 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract } try { for (String line; (line = reader.readLine()) != null; lineNumber++) { - if (!line.startsWith(COMMENT) && line.length() > 0) { - parseLineInternal(line); - // Maybe a header column? - if (curvec == null) { - continue; - } - if (dimensionality == DIMENSIONALITY_UNKNOWN) { - dimensionality = curvec.getDimensionality(); - buildMeta(); - nextevent = Event.NEXT_OBJECT; - return Event.META_CHANGED; - } else if (dimensionality > 0) { - if (dimensionality != curvec.getDimensionality()) { - dimensionality = DIMENSIONALITY_VARIABLE; - buildMeta(); - nextevent = Event.NEXT_OBJECT; - return Event.META_CHANGED; - } - } else if (curlbl != null && meta != null && meta.size() == 1) { - buildMeta(); - nextevent = Event.NEXT_OBJECT; - return Event.META_CHANGED; - } - return Event.NEXT_OBJECT; + // Skip empty lines and comments + if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { + continue; + } + parseLineInternal(line); + // Maybe a header column? + if (curvec == null) { + continue; } + final int curdim = curvec.getDimensionality(); + if (maxdim < mindim) { + mindim = curdim; + maxdim = curdim; + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; + } else if (mindim < curdim || maxdim > curdim) { + mindim = Math.min(mindim, curdim); + maxdim = Math.max(maxdim, curdim); + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; + } else if (curlbl != null && meta != null && meta.size() == 1) { + buildMeta(); + nextevent = Event.NEXT_OBJECT; + return Event.META_CHANGED; + } + return Event.NEXT_OBJECT; } reader.close(); reader = null; @@ -250,11 +246,11 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract protected void buildMeta() { if (labelcolumns.cardinality() > 0 || (labelIndices != null && labelIndices.cardinality() > 0)) { meta = new BundleMeta(2); - meta.add(getTypeInformation(dimensionality)); + meta.add(getTypeInformation(mindim, maxdim)); meta.add(TypeUtil.LABELLIST); } else { meta = new BundleMeta(1); - meta.add(getTypeInformation(dimensionality)); + meta.add(getTypeInformation(mindim, maxdim)); } } @@ -287,7 +283,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract String ent = itr.next(); if (labelIndices == null || !labelIndices.get(i)) { try { - double attribute = Double.parseDouble(ent); + double attribute = parseDouble(ent); attributes.add(attribute); continue; } catch (NumberFormatException e) { @@ -333,15 +329,16 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract /** * Get a prototype object for the given dimensionality. * - * @param dimensionality Dimensionality + * @param mindim Minimum dimensionality + * @param maxdim Maximum dimensionality * @return Prototype object */ - SimpleTypeInformation<V> getTypeInformation(int dimensionality) { - if (dimensionality > 0) { + SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) { + if (mindim == maxdim) { String[] colnames = null; if (columnnames != null) { - if (columnnames.size() - labelcolumns.cardinality() == dimensionality) { - colnames = new String[dimensionality]; + if (columnnames.size() - labelcolumns.cardinality() == mindim) { + colnames = new String[mindim]; for (int i = 0, j = 0; i < columnnames.size(); i++) { if (!labelcolumns.get(i)) { colnames[j] = columnnames.get(i); @@ -350,13 +347,13 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract } } } - return new VectorFieldTypeInformation<V>(factory, dimensionality, colnames); - } - // Variable dimensionality - return non-vector field type - if (dimensionality == DIMENSIONALITY_VARIABLE) { - return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer()); + return new VectorFieldTypeInformation<>(factory, mindim, colnames); + } else if (mindim < maxdim) { + // Variable dimensionality - return non-vector field type + return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim); + } else { + throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); } - throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); } @Override @@ -395,7 +392,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract * @param config Parameterization */ protected void getFactory(Parameterization config) { - ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<NumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class); + ObjectParameter<NumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, NumberVector.Factory.class, DoubleVector.Factory.class); if (config.grab(factoryP)) { factory = factoryP.instantiateClass(config); } @@ -420,7 +417,7 @@ public class NumberVectorLabelParser<V extends NumberVector<?>> extends Abstract @Override protected NumberVectorLabelParser<V> makeInstance() { - return new NumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, factory); + return new NumberVectorLabelParser<>(colSep, quoteChar, comment, labelIndices, factory); } } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java index 5b511a92..a0b4e573 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/Parser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.datasource.parser; import java.io.InputStream; import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; -import de.lmu.ifi.dbs.elki.utilities.InspectionUtilFrequentlyScanned; import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable; /** @@ -38,7 +37,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable; * @apiviz.uses InputStream * @apiviz.has MultipleObjectsBundle oneway - - «create» */ -public interface Parser extends Parameterizable, InspectionUtilFrequentlyScanned { +public interface Parser extends Parameterizable { /** * Returns a list of the objects parsed from the specified input stream. * diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java index ce366b9e..a3d46ed8 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SimplePolygonParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -77,11 +77,12 @@ public class SimplePolygonParser extends AbstractParser implements Parser { /** * Constructor. * - * @param colSep - * @param quoteChar + * @param colSep Column separator + * @param quoteChar Quotation character + * @param comment Comment pattern */ - public SimplePolygonParser(Pattern colSep, char quoteChar) { - super(colSep, quoteChar); + public SimplePolygonParser(Pattern colSep, char quoteChar, Pattern comment) { + super(colSep, quoteChar, comment); } @Override @@ -89,35 +90,35 @@ public class SimplePolygonParser extends AbstractParser implements Parser { BufferedReader reader = new BufferedReader(new InputStreamReader(in)); int lineNumber = 1; - List<PolygonsObject> polys = new ArrayList<PolygonsObject>(); + List<PolygonsObject> polys = new ArrayList<>(); List<LabelList> labels = null; - List<ExternalID> eids = new ArrayList<ExternalID>(); + List<ExternalID> eids = new ArrayList<>(); try { - for(String line; (line = reader.readLine()) != null; lineNumber++) { - if(!line.startsWith(COMMENT) && line.length() > 0) { - Object[] objs = parseLine(line); - polys.add((PolygonsObject) objs[0]); - if(objs[1] != null) { - if(labels == null) { - labels = new ArrayList<LabelList>(); - for(int i = 0; i < polys.size() - 1; i++) { - labels.add(null); - } + for (String line; (line = reader.readLine()) != null; lineNumber++) { + // Skip empty lines and comments + if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { + continue; + } + Object[] objs = parseLine(line); + polys.add((PolygonsObject) objs[0]); + if (objs[1] != null) { + if (labels == null) { + labels = new ArrayList<>(); + for (int i = 0; i < polys.size() - 1; i++) { + labels.add(null); } - labels.add((LabelList) objs[1]); } - eids.add((ExternalID) objs[2]); + labels.add((LabelList) objs[1]); } + eids.add((ExternalID) objs[2]); } - } - catch(IOException e) { + } catch (IOException e) { throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); } - if(labels != null) { + if (labels != null) { return MultipleObjectsBundle.makeSimple(TypeUtil.POLYGON_TYPE, polys, TypeUtil.LABELLIST, labels, TypeUtil.EXTERNALID, eids); - } - else { + } else { return MultipleObjectsBundle.makeSimple(TypeUtil.POLYGON_TYPE, polys, TypeUtil.EXTERNALID, eids); } } @@ -135,51 +136,48 @@ public class SimplePolygonParser extends AbstractParser implements Parser { ExternalID eid = null; LabelList labels = null; - List<Polygon> polys = new ArrayList<Polygon>(1); + List<Polygon> polys = new ArrayList<>(1); - List<Vector> coords = new ArrayList<Vector>(); - while(iter.hasNext()) { + List<Vector> coords = new ArrayList<>(); + while (iter.hasNext()) { String cur = iter.next(); Matcher m = COORD.matcher(cur); - if(m.find()) { + if (m.find()) { try { double c1 = Double.parseDouble(m.group(1)); double c2 = Double.parseDouble(m.group(2)); - if(m.group(3) != null) { + if (m.group(3) != null) { double c3 = Double.parseDouble(m.group(3)); coords.add(new Vector(new double[] { c1, c2, c3 })); - } - else { + } else { coords.add(new Vector(new double[] { c1, c2 })); } continue; - } - catch(NumberFormatException e) { + } catch (NumberFormatException e) { LOG.warning("Looked like a coordinate pair but didn't parse: " + cur); } } // Polygon separator. - if(cur.equals(POLYGON_SEPARATOR)) { - if(coords.size() > 0) { + if (cur.equals(POLYGON_SEPARATOR)) { + if (coords.size() > 0) { polys.add(new Polygon(coords)); - coords = new ArrayList<Vector>(); + coords = new ArrayList<>(); } continue; } // First label will become the External ID - if(eid == null) { + if (eid == null) { eid = new ExternalID(cur); - } - else { + } else { // Label - if(labels == null) { + if (labels == null) { labels = new LabelList(1); } labels.add(cur); } } // Complete polygon - if(coords.size() > 0) { + if (coords.size() > 0) { polys.add(new Polygon(coords)); } return new Object[] { new PolygonsObject(polys), labels, eid }; @@ -201,19 +199,24 @@ public class SimplePolygonParser extends AbstractParser implements Parser { @Override protected void makeOptions(Parameterization config) { PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, "\\s+"); - if(config.grab(colParam)) { + if (config.grab(colParam)) { colSep = colParam.getValue(); } StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR)); quoteParam.addConstraint(new StringLengthConstraint(1, 1)); - if(config.grab(quoteParam)) { + if (config.grab(quoteParam)) { quoteChar = quoteParam.getValue().charAt(0); } + + PatternParameter commentP = new PatternParameter(COMMENT_ID, COMMENT_PATTERN); + if (config.grab(commentP)) { + comment = commentP.getValue(); + } } @Override protected SimplePolygonParser makeInstance() { - return new SimplePolygonParser(colSep, quoteChar); + return new SimplePolygonParser(colSep, quoteChar, comment); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java index 35e53bb7..5f9e5e05 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseBitVectorLabelParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -64,11 +64,12 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser /** * Constructor. * - * @param colSep - * @param quoteChar + * @param colSep Column separator + * @param quoteChar Quotation character + * @param comment Comment pattern */ - public SparseBitVectorLabelParser(Pattern colSep, char quoteChar) { - super(colSep, quoteChar); + public SparseBitVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment) { + super(colSep, quoteChar, comment); } @Override @@ -76,54 +77,54 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser BufferedReader reader = new BufferedReader(new InputStreamReader(in)); int lineNumber = 0; int dimensionality = -1; - List<BitVector> vectors = new ArrayList<BitVector>(); - List<LabelList> lblc = new ArrayList<LabelList>(); + List<BitVector> vectors = new ArrayList<>(); + List<LabelList> lblc = new ArrayList<>(); try { - List<BitSet> bitSets = new ArrayList<BitSet>(); - List<LabelList> allLabels = new ArrayList<LabelList>(); - for(String line; (line = reader.readLine()) != null; lineNumber++) { - if(!line.startsWith(COMMENT) && line.length() > 0) { - List<String> entries = tokenize(line); - BitSet bitSet = new BitSet(); - LabelList labels = null; - - for(String entry : entries) { - try { - int index = Integer.parseInt(entry); - bitSet.set(index); - dimensionality = Math.max(dimensionality, index); - } - catch(NumberFormatException e) { - if(labels == null) { - labels = new LabelList(1); - } - labels.add(entry); + List<BitSet> bitSets = new ArrayList<>(); + List<LabelList> allLabels = new ArrayList<>(); + for (String line; (line = reader.readLine()) != null; lineNumber++) { + // Skip empty lines and comments + if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { + continue; + } + List<String> entries = tokenize(line); + BitSet bitSet = new BitSet(); + LabelList labels = null; + + for (String entry : entries) { + try { + int index = Integer.parseInt(entry); + bitSet.set(index); + dimensionality = Math.max(dimensionality, index); + } catch (NumberFormatException e) { + if (labels == null) { + labels = new LabelList(1); } + labels.add(entry); } - - bitSets.add(bitSet); - allLabels.add(labels); } + + bitSets.add(bitSet); + allLabels.add(labels); } dimensionality++; - for(int i = 0; i < bitSets.size(); i++) { + for (int i = 0; i < bitSets.size(); i++) { BitSet bitSet = bitSets.get(i); LabelList labels = allLabels.get(i); vectors.add(new BitVector(bitSet, dimensionality)); lblc.add(labels); } - } - catch(IOException e) { + } catch (IOException e) { throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); } return MultipleObjectsBundle.makeSimple(getTypeInformation(dimensionality), vectors, TypeUtil.LABELLIST, lblc); } protected VectorFieldTypeInformation<BitVector> getTypeInformation(int dimensionality) { - return new VectorFieldTypeInformation<BitVector>(BitVector.FACTORY, dimensionality); + return new VectorFieldTypeInformation<>(BitVector.FACTORY, dimensionality); } - + @Override protected Logging getLogger() { return LOG; @@ -139,7 +140,7 @@ public class SparseBitVectorLabelParser extends AbstractParser implements Parser public static class Parameterizer extends AbstractParser.Parameterizer { @Override protected SparseBitVectorLabelParser makeInstance() { - return new SparseBitVectorLabelParser(colSep, quoteChar); + return new SparseBitVectorLabelParser(colSep, quoteChar, comment); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java index 9f658b0a..d5fe6219 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseFloatVectorLabelParser.java @@ -75,10 +75,11 @@ public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser< *
* @param colSep Column separator
* @param quoteChar Quotation character
- * @param labelIndices Label indexes
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
*/
- public SparseFloatVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices) {
- super(colSep, quoteChar, labelIndices, SparseFloatVector.FACTORY);
+ public SparseFloatVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices) {
+ super(colSep, quoteChar, comment, labelIndices, SparseFloatVector.FACTORY);
}
/**
@@ -91,7 +92,7 @@ public class SparseFloatVectorLabelParser extends SparseNumberVectorLabelParser< public static class Parameterizer extends SparseNumberVectorLabelParser.Parameterizer<SparseFloatVector> {
@Override
protected SparseFloatVectorLabelParser makeInstance() {
- return new SparseFloatVectorLabelParser(colSep, quoteChar, labelIndices);
+ return new SparseFloatVectorLabelParser(colSep, quoteChar, comment, labelIndices);
}
}
}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java index f4ec8c59..bdd8ab77 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/SparseNumberVectorLabelParser.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.SparseFloatVector; import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -86,12 +87,6 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte private static final Logging LOG = Logging.getLogger(SparseNumberVectorLabelParser.class);
/**
- * Holds the dimensionality of the parsed data which is the maximum occurring
- * index of any attribute.
- */
- private int maxdim = -1;
-
- /**
* Same as {@link #factory}, but subtype.
*/
private SparseNumberVector.Factory<V, ?> sparsefactory;
@@ -101,11 +96,12 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte *
* @param colSep Column separator
* @param quoteChar Quotation character
- * @param labelIndices Label indexes
+ * @param comment Comment pattern
+ * @param labelIndices Indices to use as labels
* @param factory Vector factory
*/
- public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
- super(colSep, quoteChar, labelIndices, factory);
+ public SparseNumberVectorLabelParser(Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) {
+ super(colSep, quoteChar, comment, labelIndices, factory);
this.sparsefactory = factory;
}
@@ -116,7 +112,8 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte TIntDoubleHashMap values = new TIntDoubleHashMap(cardinality, 1);
LabelList labels = null;
-
+ int thismax = 0;
+
for (int i = 1; i < entries.size() - 1; i++) {
if (labelIndices == null || !labelIndices.get(i)) {
try {
@@ -124,7 +121,8 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte if (index >= maxdim) {
maxdim = index + 1;
}
- double attribute = Double.parseDouble(entries.get(i));
+ thismax = Math.max(thismax, index);
+ double attribute = parseDouble(entries.get(i));
values.put(index, attribute);
i++;
} catch (NumberFormatException e) {
@@ -144,17 +142,19 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte if (values.size() > maxdim) {
throw new AbortException("Invalid sparse vector seen: " + line);
}
+ if (thismax < mindim) {
+ mindim = thismax;
+ }
curvec = sparsefactory.newNumberVector(values, maxdim);
curlbl = labels;
}
@Override
- protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) {
- if (dimensionality > 0) {
- return new VectorFieldTypeInformation<V>(factory, dimensionality);
- }
- if (dimensionality == DIMENSIONALITY_VARIABLE) {
- return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer());
+ protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
+ if (mindim == maxdim) {
+ return new VectorFieldTypeInformation<>(factory, mindim);
+ } else if (mindim < maxdim) {
+ return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim);
}
throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
}
@@ -174,7 +174,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte public static class Parameterizer<V extends SparseNumberVector<?>> extends NumberVectorLabelParser.Parameterizer<V> {
@Override
protected void getFactory(Parameterization config) {
- ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<SparseNumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
+ ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
if (config.grab(factoryP)) {
factory = factoryP.instantiateClass(config);
}
@@ -182,7 +182,7 @@ public class SparseNumberVectorLabelParser<V extends SparseNumberVector<?>> exte @Override
protected SparseNumberVectorLabelParser<V> makeInstance() {
- return new SparseNumberVectorLabelParser<V>(colSep, quoteChar, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
+ return new SparseNumberVectorLabelParser<>(colSep, quoteChar, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java index 01579dc6..73d38e3c 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/StreamingParser.java @@ -8,7 +8,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java new file mode 100644 index 00000000..41f21c5d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/StringParser.java @@ -0,0 +1,146 @@ +package de.lmu.ifi.dbs.elki.datasource.parser; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import de.lmu.ifi.dbs.elki.data.LabelList; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; + +/** + * Parser that loads a text file for use with string similarity measures. + * + * The parser produces two relations: the first of type String, the second of + * type label list, which contains the same data for convenience. + * + * @author Felix Stahlberg + * @author Erich Schubert + */ +@Title("String Parser") +@Description("Parses new line separated strings") +public class StringParser implements Parser { + /** + * Comment pattern. + */ + Pattern comment; + + /** + * Flag to trim whitespace. + */ + boolean trimWhitespace; + + /** + * Constructor. + * + * @param comment Pattern for comments. + * @param trimWhitespace Trim leading and trailing whitespace. + */ + public StringParser(Pattern comment, boolean trimWhitespace) { + super(); + this.comment = comment; + this.trimWhitespace = trimWhitespace; + } + + @Override + public MultipleObjectsBundle parse(InputStream in) { + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + int lineNumber = 0; + List<String> data = new ArrayList<>(); + List<LabelList> labels = new ArrayList<>(); + try { + for (String line; (line = reader.readLine()) != null; lineNumber++) { + // Skip empty lines and comments + if (line.length() <= 0 || (comment != null && comment.matcher(line).matches())) { + continue; + } + final String val = trimWhitespace ? line.trim() : line; + data.add(val); + LabelList ll = new LabelList(1); + ll.add(val); + labels.add(ll); + } + } catch (IOException e) { + throw new IllegalArgumentException("Error while parsing line " + lineNumber + "."); + } + return MultipleObjectsBundle.makeSimple(TypeUtil.STRING, data, TypeUtil.LABELLIST, labels); + } + + /** + * Parameterization class. + * + * @author Felix Stahlberg + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Flag to trim whitespace. + */ + public static final OptionID TRIM_ID = new OptionID("string.trim", "Remove leading and trailing whitespace from each line."); + + /** + * Comment pattern. + */ + Pattern comment = null; + + /** + * Flag to trim whitespace. + */ + boolean trimWhitespace = false; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + PatternParameter commentP = new PatternParameter(AbstractParser.Parameterizer.COMMENT_ID, "^\\s*#.*$"); + if (config.grab(commentP)) { + comment = commentP.getValue(); + } + + Flag trimP = new Flag(TRIM_ID); + if (config.grab(trimP)) { + trimWhitespace = trimP.isTrue(); + } + } + + @Override + protected StringParser makeInstance() { + return new StringParser(comment, trimWhitespace); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java index 2ea6ebb5..580c5320 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/TermFrequencyParser.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.datasource.parser; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,10 +24,11 @@ package de.lmu.ifi.dbs.elki.datasource.parser; */ import gnu.trove.iterator.TIntDoubleIterator; +import gnu.trove.map.TObjectIntMap; import gnu.trove.map.hash.TIntDoubleHashMap; +import gnu.trove.map.hash.TObjectIntHashMap; import java.util.BitSet; -import java.util.HashMap; import java.util.List; import java.util.regex.Pattern; @@ -36,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.SparseFloatVector; import de.lmu.ifi.dbs.elki.data.SparseNumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -62,14 +64,14 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class); /** - * Maximum dimension used. + * Number of different terms observed. */ - int maxdim; + int numterms; /** * Map. */ - HashMap<String, Integer> keymap; + TObjectIntMap<String> keymap; /** * Normalize. @@ -85,15 +87,15 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number * Constructor. * * @param normalize Normalize - * @param colSep - * @param quoteChar - * @param labelIndices + * @param colSep Column separator + * @param quoteChar Quotation character + * @param comment Comment pattern + * @param labelIndices Indices to use as labels */ - public TermFrequencyParser(boolean normalize, Pattern colSep, char quoteChar, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) { - super(colSep, quoteChar, labelIndices, factory); + public TermFrequencyParser(boolean normalize, Pattern colSep, char quoteChar, Pattern comment, BitSet labelIndices, SparseNumberVector.Factory<V, ?> factory) { + super(colSep, quoteChar, comment, labelIndices, factory); this.normalize = normalize; - this.maxdim = 0; - this.keymap = new HashMap<String, Integer>(); + this.keymap = new TObjectIntHashMap<>(1001, .5f, -1); this.sparsefactory = factory; } @@ -111,12 +113,12 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number curterm = entries.get(i); } else { try { - double attribute = Double.parseDouble(entries.get(i)); - Integer curdim = keymap.get(curterm); - if (curdim == null) { - curdim = Integer.valueOf(maxdim + 1); + double attribute = parseDouble(entries.get(i)); + int curdim = keymap.get(curterm); + if (curdim < 0) { + curdim = numterms; keymap.put(curterm, curdim); - maxdim += 1; + ++numterms; } values.put(curdim, attribute); len += attribute; @@ -147,17 +149,16 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number } } - curvec = sparsefactory.newNumberVector(values, maxdim); + curvec = sparsefactory.newNumberVector(values, numterms); curlbl = labels; } @Override - protected SimpleTypeInformation<V> getTypeInformation(int dimensionality) { - if (dimensionality > 0) { - return new VectorFieldTypeInformation<V>(factory, dimensionality); - } - if (dimensionality == DIMENSIONALITY_VARIABLE) { - return new SimpleTypeInformation<V>(factory.getRestrictionClass(), factory.getDefaultSerializer()); + protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) { + if (mindim == maxdim) { + return new VectorFieldTypeInformation<>(factory, mindim); + } else if (mindim < maxdim) { + return new VectorTypeInformation<>(factory.getRestrictionClass(), factory.getDefaultSerializer(), mindim, maxdim); } throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); } @@ -196,7 +197,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number @Override protected void getFactory(Parameterization config) { - ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<SparseNumberVector.Factory<V, ?>>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class); + ObjectParameter<SparseNumberVector.Factory<V, ?>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class); if (config.grab(factoryP)) { factory = factoryP.instantiateClass(config); } @@ -204,7 +205,7 @@ public class TermFrequencyParser<V extends SparseNumberVector<?>> extends Number @Override protected TermFrequencyParser<V> makeInstance() { - return new TermFrequencyParser<V>(normalize, colSep, quoteChar, labelIndices, (SparseNumberVector.Factory<V, ?>) factory); + return new TermFrequencyParser<>(normalize, colSep, quoteChar, comment, labelIndices, (SparseNumberVector.Factory<V, ?>) factory); } } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java index 58ae9a77..c21ab31f 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/package-info.java @@ -42,7 +42,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |