Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java')
-rw-r--r--  elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java  688
1 file changed, 688 insertions(+), 0 deletions(-)
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
new file mode 100644
index 00000000..515c68c4
--- /dev/null
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/parser/ArffParser.java
@@ -0,0 +1,688 @@
+package de.lmu.ifi.dbs.elki.datasource.parser;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2015
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.iterator.TIntObjectIterator;
+import gnu.trove.map.hash.TIntDoubleHashMap;
+import gnu.trove.map.hash.TIntObjectHashMap;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.StreamTokenizer;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.data.ClassLabel;
+import de.lmu.ifi.dbs.elki.data.DoubleVector;
+import de.lmu.ifi.dbs.elki.data.ExternalID;
+import de.lmu.ifi.dbs.elki.data.LabelList;
+import de.lmu.ifi.dbs.elki.data.SimpleClassLabel;
+import de.lmu.ifi.dbs.elki.data.SparseDoubleVector;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
+
+/**
+ * Parser to load WEKA .arff files into ELKI.
+ *
+ * This parser is quite hackish and contains a lot of magic that is not yet
+ * configurable.
+ *
+ * TODO: Allow configuration of the vector types (double, float)
+ *
+ * TODO: when encountering integer columns, produce integer vectors.
+ *
+ * TODO: allow optional class labels.
+ *
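+ * For illustration, a small (hypothetical) ARFF file of the kind this parser
+ * is meant to handle could look like this:
+ *
+ * <pre>
+ * % A comment
+ * &#64;relation example
+ * &#64;attribute x1 numeric
+ * &#64;attribute x2 numeric
+ * &#64;attribute Class {a,b}
+ * &#64;data
+ * 1.0,2.0,a
+ * 3.5,0.5,b
+ * </pre>
+ *
+ * The two successive numeric attributes are merged into one double vector of
+ * dimensionality 2, while the attribute named "Class" matches the default
+ * class label pattern and is read as a class label column. A usage sketch,
+ * assuming the default magic patterns:
+ *
+ * <pre>
+ * ArffParser parser = new ArffParser(DEFAULT_ARFF_MAGIC_EID, DEFAULT_ARFF_MAGIC_CLASS);
+ * MultipleObjectsBundle bundle = parser.parse(inputStream);
+ * </pre>
+ *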
+ * @author Erich Schubert
+ */
+public class ArffParser implements Parser {
+ /**
+ * Logger.
+ */
+ private static final Logging LOG = Logging.getLogger(ArffParser.class);
+
+ /**
+ * Arff file marker.
+ */
+ public static final Matcher ARFF_HEADER_RELATION = Pattern.compile("@relation\\s+(.*)", Pattern.CASE_INSENSITIVE).matcher("");
+
+ /**
+ * Arff attribute declaration marker.
+ */
+ public static final Matcher ARFF_HEADER_ATTRIBUTE = Pattern.compile("@attribute\\s+([^ ]+|['\"].*?['\"])\\s+(numeric|real|integer|string|double|date(\\s.*)|\\{.*\\})\\s*", Pattern.CASE_INSENSITIVE).matcher("");
+
+ /**
+ * Arff data marker.
+ */
+ public static final Matcher ARFF_HEADER_DATA = Pattern.compile("@data\\s*", Pattern.CASE_INSENSITIVE).matcher("");
+
+ /**
+ * Comment pattern.
+ */
+ public static final Matcher ARFF_COMMENT = Pattern.compile("^\\s*%.*").matcher("");
+
+ /**
+ * Pattern to auto-convert columns to external ids.
+ */
+ public static final String DEFAULT_ARFF_MAGIC_EID = "(External-?ID)";
+
+ /**
+ * Pattern to auto-convert columns to class labels.
+ */
+ public static final String DEFAULT_ARFF_MAGIC_CLASS = "(Class|Class-?Label)";
+
+ /**
+ * Pattern for numeric columns.
+ */
+ public static final Matcher ARFF_NUMERIC = Pattern.compile("(numeric|real|integer|double)", Pattern.CASE_INSENSITIVE).matcher("");
+
+ /**
+ * Empty line pattern.
+ */
+ public static final Matcher EMPTY = Pattern.compile("^\\s*$").matcher("");
+
+ /**
+ * Pattern to recognize external ids.
+ */
+ Matcher magic_eid;
+
+ /**
+ * Pattern to recognize class label columns.
+ */
+ Matcher magic_class;
+
+ /**
+ * (Reused) buffer for building label lists.
+ */
+ ArrayList<String> labels = new ArrayList<>();
+
+ /**
+ * Constructor.
+ *
+ * @param magic_eid Magic to recognize external IDs
+ * @param magic_class Magic to recognize class labels
+ */
+ public ArffParser(Pattern magic_eid, Pattern magic_class) {
+ super();
+ this.magic_eid = magic_eid.matcher("");
+ this.magic_class = magic_class.matcher("");
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param magic_eid Magic to recognize external IDs
+ * @param magic_class Magic to recognize class labels
+ */
+ public ArffParser(String magic_eid, String magic_class) {
+ this(Pattern.compile(magic_eid, Pattern.CASE_INSENSITIVE), Pattern.compile(magic_class, Pattern.CASE_INSENSITIVE));
+ }
+
+ @Override
+ public MultipleObjectsBundle parse(InputStream instream) {
+ try {
+ BufferedReader br = new BufferedReader(new InputStreamReader(instream));
+ ArrayList<String> names = new ArrayList<>();
+ ArrayList<String> types = new ArrayList<>();
+
+ readHeader(br);
+ parseAttributeStatements(br, names, types);
+
+ // Convert into column mapping. Prepare arrays to fill
+ int[] targ = new int[names.size()];
+ TypeInformation[] elkitypes = new TypeInformation[names.size()];
+ int[] dimsize = new int[names.size()];
+ processColumnTypes(names, types, targ, elkitypes, dimsize);
+
+ // Prepare bundle:
+ // This is a bit complicated to produce vector fields.
+ MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+ StreamTokenizer tokenizer = makeArffTokenizer(br);
+
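+ // Parser state: 0 = no data seen yet, 1 = dense instances, 2 = sparse instances.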
+ int state = 0;
+
+ nextToken(tokenizer);
+ while(tokenizer.ttype != StreamTokenizer.TT_EOF) {
+ // Parse instance
+ if(tokenizer.ttype == StreamTokenizer.TT_EOL) {
+ // ignore empty lines
+ }
+ else if(tokenizer.ttype != '{') {
+ if(state == 0) {
+ setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, false);
+ state = 1; // dense
+ }
+ if(state != 1) {
+ throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
+ }
+ // Load a dense instance
+ bundle.appendSimple(loadDenseInstance(tokenizer, dimsize, elkitypes, bundle.metaLength()));
+ }
+ else {
+ if(state == 0) {
+ setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, true);
+ state = 2; // sparse
+ }
+ if(state != 2) {
+ throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
+ }
+ bundle.appendSimple(loadSparseInstance(tokenizer, targ, dimsize, elkitypes, bundle.metaLength()));
+ }
+ if(tokenizer.ttype != StreamTokenizer.TT_EOF) {
+ nextToken(tokenizer);
+ }
+ }
+ return bundle;
+ }
+ catch(IOException e) {
+ throw new AbortException("IO error in parser", e);
+ }
+ }
+
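+ /**
+ * Load a sparse instance of the form <code>{i1 v1, i2 v2, ...}</code>, where
+ * each index refers to an ARFF input column.
+ *
+ * @param tokenizer Tokenizer, positioned at the opening brace
+ * @param targ Mapping of ARFF columns to bundle columns
+ * @param dimsize Dimensionality of each bundle column
+ * @param elkitypes ELKI type information of each bundle column
+ * @param metaLength Number of bundle columns
+ * @return One row for the bundle
+ * @throws IOException on read errors
+ */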
+ private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
+ // logger.warning("Sparse instance.");
+ TIntObjectHashMap<Object> map = new TIntObjectHashMap<>();
+ while(true) {
+ nextToken(tokenizer);
+ assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
+ if(tokenizer.ttype == '}') {
+ nextToken(tokenizer);
+ assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
+ break;
+ }
+ else {
+ // sparse token
+ if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
+ throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
+ }
+ int dim = Integer.parseInt(tokenizer.sval);
+ if(map.containsKey(dim)) {
+ throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
+ }
+ nextToken(tokenizer);
+ if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]])) {
+ map.put(dim, FormatUtil.parseDouble(tokenizer.sval));
+ }
+ else {
+ map.put(dim, tokenizer.sval);
+ }
+ }
+ else {
+ throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
+ }
+ }
+ }
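+ // Second pass: distribute the collected values to the output columns.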
+ Object[] data = new Object[metaLength];
+ for(int out = 0; out < metaLength; out++) {
+ // Find the first input column that maps to this output column.
+ int s = -1;
+ for(int i = 0; i < targ.length; i++) {
+ if(targ[i] == out && s < 0) {
+ s = i;
+ break;
+ }
+ }
+ assert (s >= 0);
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
+ TIntDoubleHashMap f = new TIntDoubleHashMap(dimsize[out]);
+ for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) {
+ iter.advance();
+ int i = iter.key();
+ if(i < s) {
+ continue;
+ }
+ if(i >= s + dimsize[out]) {
+ continue; // Hash iteration order is arbitrary, so do not break early.
+ }
+ double v = ((Double) iter.value()).doubleValue();
+ f.put(i - s, v);
+ }
+ data[out] = new SparseDoubleVector(f, dimsize[out]);
+ }
+ else if(TypeUtil.LABELLIST.equals(elkitypes[out])) {
+ // Build a label list out of successive labels
+ labels.clear();
+ for(TIntObjectIterator<Object> iter = map.iterator(); iter.hasNext();) {
+ iter.advance();
+ int i = iter.key();
+ if(i < s) {
+ continue;
+ }
+ if(i >= s + dimsize[out]) {
+ continue; // Hash iteration order is arbitrary, so do not break early.
+ }
+ String v = (String) iter.value();
+ if(labels.size() < i - s) {
+ LOG.warning("Sparse label lists with gaps are currently not correctly supported.");
+ }
+ labels.add(v);
+ }
+ data[out] = LabelList.make(labels);
+ }
+ else if(TypeUtil.EXTERNALID.equals(elkitypes[out])) {
+ String val = (String) map.get(s);
+ if(val != null) {
+ data[out] = new ExternalID(val);
+ }
+ else {
+ throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
+ }
+ }
+ else if(TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
+ Object val = map.get(s);
+ if(val != null) {
+ // TODO: support other class label types.
+ ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
+ data[out] = lbl;
+ }
+ else {
+ throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
+ }
+ }
+ else {
+ throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
+ }
+ }
+ return data;
+ }
+
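+ /**
+ * Load a dense instance, i.e. a comma separated line of values.
+ *
+ * @param tokenizer Tokenizer, positioned at the first value
+ * @param dimsize Dimensionality of each bundle column
+ * @param etyp ELKI type information of each bundle column
+ * @param outdim Number of bundle columns
+ * @return One row for the bundle
+ * @throws IOException on read errors
+ */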
+ private Object[] loadDenseInstance(StreamTokenizer tokenizer, int[] dimsize, TypeInformation[] etyp, int outdim) throws IOException {
+ Object[] data = new Object[outdim];
+ for(int out = 0; out < outdim; out++) {
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
+ // For multi-column vectors, read successive columns
+ double[] cur = new double[dimsize[out]];
+ for(int k = 0; k < dimsize[out]; k++) {
+ if(tokenizer.ttype == '?') {
+ cur[k] = Double.NaN; // Missing value
+ }
+ else if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
+ try {
+ cur[k] = FormatUtil.parseDouble(tokenizer.sval);
+ }
+ catch(NumberFormatException e) {
+ throw new AbortException("Expected number value, got: " + tokenizer.sval);
+ }
+ }
+ else {
+ throw new AbortException("Expected word token, got: " + tokenizer.toString());
+ }
+ nextToken(tokenizer);
+ }
+ data[out] = new DoubleVector(cur);
+ }
+ else if(TypeUtil.LABELLIST.equals(etyp[out])) {
+ // Build a label list out of successive labels
+ labels.clear();
+ for(int k = 0; k < dimsize[out]; k++) {
+ if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
+ throw new AbortException("Expected word token, got: " + tokenizer.toString());
+ }
+ labels.add(tokenizer.sval);
+ nextToken(tokenizer);
+ }
+ data[out] = LabelList.make(labels);
+ }
+ else if(TypeUtil.EXTERNALID.equals(etyp[out])) {
+ if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
+ throw new AbortException("Expected word token, got: " + tokenizer.toString());
+ }
+ data[out] = new ExternalID(tokenizer.sval);
+ nextToken(tokenizer);
+ }
+ else if(TypeUtil.CLASSLABEL.equals(etyp[out])) {
+ if(tokenizer.ttype != StreamTokenizer.TT_WORD) {
+ throw new AbortException("Expected word token, got: " + tokenizer.toString());
+ }
+ // TODO: support other class label types.
+ ClassLabel lbl = new SimpleClassLabel(tokenizer.sval);
+ data[out] = lbl;
+ nextToken(tokenizer);
+ }
+ else {
+ throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
+ }
+ }
+ return data;
+ }
+
+ /**
+ * Make a StreamTokenizer for the ARFF format.
+ *
+ * @param br Buffered reader
+ * @return Tokenizer
+ */
+ private StreamTokenizer makeArffTokenizer(BufferedReader br) {
+ // Setup tokenizer
+ StreamTokenizer tokenizer = new StreamTokenizer(br);
+ {
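+ // Custom syntax: numbers are not parsed by the tokenizer (we parse them
+ // ourselves via FormatUtil), commas act as whitespace so each value becomes
+ // its own token, '%' starts a comment, both quote styles are honored, and
+ // line ends are significant to delimit instances.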
+ tokenizer.resetSyntax();
+ tokenizer.whitespaceChars(0, ' ');
+ tokenizer.ordinaryChars('0', '9'); // Do not parse numbers
+ tokenizer.ordinaryChar('-');
+ tokenizer.ordinaryChar('.');
+ tokenizer.wordChars(' ' + 1, '\u00FF');
+ tokenizer.whitespaceChars(',', ',');
+ tokenizer.commentChar('%');
+ tokenizer.quoteChar('"');
+ tokenizer.quoteChar('\'');
+ tokenizer.ordinaryChar('{');
+ tokenizer.ordinaryChar('}');
+ tokenizer.eolIsSignificant(true);
+ }
+ return tokenizer;
+ }
+
+ /**
+ * Setup the headers for the object bundle.
+ *
+ * @param names Attribute names
+ * @param targ Target columns
+ * @param etyp ELKI type information
+ * @param dimsize Number of dimensions in the individual types
+ * @param bundle Output bundle
+ * @param sparse Flag to create sparse vectors
+ */
+ private void setupBundleHeaders(ArrayList<String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse) {
+ for(int in = 0, out = 0; in < targ.length; out++) {
+ int nin = in + 1;
+ for(; nin < targ.length; nin++) {
+ if(targ[nin] != targ[in]) {
+ break;
+ }
+ }
+ if(TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
+ String[] labels = new String[dimsize[out]];
+ // Collect labels:
+ for(int i = 0; i < dimsize[out]; i++) {
+ labels[i] = names.get(in + i); // Attribute names are indexed by input column.
+ }
+ if(!sparse) {
+ VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels);
+ bundle.appendColumn(type, new ArrayList<DoubleVector>());
+ }
+ else {
+ VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels);
+ bundle.appendColumn(type, new ArrayList<SparseDoubleVector>());
+ }
+ }
+ else if(TypeUtil.LABELLIST.equals(etyp[out])) {
+ StringBuilder label = new StringBuilder(names.get(in));
+ for(int i = 1; i < dimsize[out]; i++) {
+ label.append(' ').append(names.get(in + i));
+ }
+ bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>());
+ }
+ else if(TypeUtil.EXTERNALID.equals(etyp[out])) {
+ bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(in)), new ArrayList<ExternalID>());
+ }
+ else if(TypeUtil.CLASSLABEL.equals(etyp[out])) {
+ bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(in)), new ArrayList<ClassLabel>());
+ }
+ else {
+ throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
+ }
+ assert (out == bundle.metaLength() - 1);
+ in = nin;
+ }
+ }
+
+ /**
+ * Read the dataset header part of the ARFF file, to ensure consistency.
+ *
+ * @param br Buffered Reader
+ * @throws IOException on read errors
+ */
+ private void readHeader(BufferedReader br) throws IOException {
+ String line;
+ // Locate header line
+ while(true) {
+ line = br.readLine();
+ if(line == null) {
+ throw new AbortException("@relation declaration not found in ARFF file.");
+ }
+ // Skip comments and empty lines
+ if(ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) {
+ continue;
+ }
+ // Break on relation statement
+ if(ARFF_HEADER_RELATION.reset(line).matches()) {
+ break;
+ }
+ throw new AbortException("Expected relation declaration: " + line);
+ }
+ }
+
+ /**
+ * Parse the "@attribute" section of the ARFF file.
+ *
+ * @param br Input
+ * @param names List (to fill) of attribute names
+ * @param types List (to fill) of attribute types
+ * @throws IOException on read errors
+ */
+ private void parseAttributeStatements(BufferedReader br, ArrayList<String> names, ArrayList<String> types) throws IOException {
+ String line;
+ // Load attribute metadata
+ while(true) {
+ line = br.readLine();
+ if(line == null) {
+ throw new AbortException("@data declaration not found in ARFF file.");
+ }
+ // Skip comments and empty lines
+ if(ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) {
+ continue;
+ }
+ // Break on data statement to continue
+ if(ARFF_HEADER_DATA.reset(line).matches()) {
+ break;
+ }
+ // Expect an attribute specification
+ Matcher matcher = ARFF_HEADER_ATTRIBUTE.reset(line);
+ if(matcher.matches()) {
+ String name = matcher.group(1);
+ if(name.charAt(0) == '\'' && name.charAt(name.length() - 1) == '\'') {
+ name = name.substring(1, name.length() - 1);
+ }
+ else if(name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
+ name = name.substring(1, name.length() - 1);
+ }
+ String type = matcher.group(2);
+ names.add(name);
+ types.add(type);
+ // logger.warning("Attribute name: " + name + " type: " + type);
+ continue;
+ }
+ throw new AbortException("Unrecognized line: " + line);
+ }
+ assert (names.size() == types.size());
+ }
+
+ /**
+ * Process the column types (and names!) into ELKI relation style. Note that
+ * this will for example merge successive numerical columns into a single
+ * vector.
+ *
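+ * For example (illustrative, assuming the default magic patterns), the four
+ * ARFF attributes <code>External-ID string</code>, <code>x1 numeric</code>,
+ * <code>x2 numeric</code>, <code>Class {a,b}</code> yield three ELKI columns:
+ * targ = {0, 1, 1, 2}, etyp = {EXTERNALID, NUMBER_VECTOR_FIELD, CLASSLABEL}
+ * and dims = {1, 2, 1}, i.e. the two numeric attributes become one vector
+ * field of dimensionality 2.
+ *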
+ * @param names Attribute names
+ * @param types Attribute types
+ * @param targ Target dimension mapping (ARFF to ELKI), return value
+ * @param etyp ELKI type information, return value
+ * @param dims Number of successive dimensions, return value
+ */
+ private void processColumnTypes(ArrayList<String> names, ArrayList<String> types, int[] targ, TypeInformation[] etyp, int[] dims) {
+ int next = 0;
+ for(int i = 0; i < targ.length; i++) {
+ if(magic_eid != null && magic_eid.reset(names.get(i)).matches()) {
+ // Turn into an external ID column.
+ targ[i] = next;
+ etyp[next] = TypeUtil.EXTERNALID;
+ dims[next] = 1;
+ next++;
+ continue;
+ }
+ else if(magic_class != null && magic_class.reset(names.get(i)).matches()) {
+ // Type as ClassLabel
+ targ[i] = next;
+ etyp[next] = TypeUtil.CLASSLABEL;
+ dims[next] = 1;
+ next++;
+ continue;
+ }
+ else if(ARFF_NUMERIC.reset(types.get(i)).matches()) {
+ // Create a number vector field
+ if(next > 0 && TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[next - 1])) {
+ targ[i] = next - 1;
+ dims[next - 1]++;
+ continue;
+ }
+ else {
+ targ[i] = next;
+ etyp[next] = TypeUtil.NUMBER_VECTOR_FIELD;
+ dims[next] = 1;
+ next++;
+ continue;
+ }
+ }
+ else {
+ // Use LabelList
+ if(next > 0 && TypeUtil.LABELLIST.equals(etyp[next - 1])) {
+ targ[i] = next - 1;
+ dims[next - 1]++;
+ continue;
+ }
+ else {
+ targ[i] = next;
+ etyp[next] = TypeUtil.LABELLIST;
+ dims[next] = 1;
+ next++;
+ continue;
+ }
+ }
+ }
+ }
+
+ /**
+ * Read the next token, mapping quoted strings to word tokens and the
+ * literal "?" to a missing value marker.
+ *
+ * @param tokenizer Tokenizer
+ * @throws IOException on read errors
+ */
+ private void nextToken(StreamTokenizer tokenizer) throws IOException {
+ tokenizer.nextToken();
+ if((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
+ tokenizer.ttype = StreamTokenizer.TT_WORD;
+ }
+ else if((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) {
+ tokenizer.ttype = '?';
+ }
+ if(LOG.isDebugging()) {
+ if(tokenizer.ttype == StreamTokenizer.TT_NUMBER) {
+ LOG.debug("token: " + tokenizer.nval);
+ }
+ else if(tokenizer.ttype == StreamTokenizer.TT_WORD) {
+ LOG.debug("token: " + tokenizer.sval);
+ }
+ else if(tokenizer.ttype == StreamTokenizer.TT_EOF) {
+ LOG.debug("token: EOF");
+ }
+ else if(tokenizer.ttype == StreamTokenizer.TT_EOL) {
+ LOG.debug("token: EOL");
+ }
+ else {
+ LOG.debug("token type: " + tokenizer.ttype);
+ }
+ }
+ }
+
+ @Override
+ public void cleanup() {
+ if (magic_eid != null) {
+ magic_eid.reset("");
+ }
+ if (magic_class != null) {
+ magic_class.reset("");
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Pattern for recognizing external ID attributes.
+ */
+ public static final OptionID MAGIC_EID_ID = new OptionID("arff.externalid", "Pattern to recognize external ID attributes.");
+
+ /**
+ * Pattern for recognizing class label attributes.
+ */
+ public static final OptionID MAGIC_CLASS_ID = new OptionID("arff.classlabel", "Pattern to recognize class label attributes.");
+
+ /**
+ * Pattern to recognize external ids
+ */
+ Pattern magic_eid;
+
+ /**
+ * Pattern to recognize class label columns
+ */
+ Pattern magic_class;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ PatternParameter eidP = new PatternParameter(MAGIC_EID_ID, DEFAULT_ARFF_MAGIC_EID);
+ if(config.grab(eidP)) {
+ magic_eid = eidP.getValue();
+ }
+ PatternParameter classP = new PatternParameter(MAGIC_CLASS_ID, DEFAULT_ARFF_MAGIC_CLASS);
+ if(config.grab(classP)) {
+ magic_class = classP.getValue();
+ }
+ }
+
+ @Override
+ protected ArffParser makeInstance() {
+ return new ArffParser(magic_eid, magic_class);
+ }
+ }
+}
\ No newline at end of file