package de.lmu.ifi.dbs.elki.datasource.parser;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.StringLengthConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter;
/**
* Abstract superclass for all parsers providing the option handler for handling
* options.
*
* @author Arthur Zimek
*/
public abstract class AbstractParser {
  /**
   * Default column separator pattern: either a run of whitespace, or a single
   * comma or semicolon optionally surrounded by whitespace.
   */
  public static final String DEFAULT_SEPARATOR = "(\\s+|\\s*[,;]\\s*)";

  /**
   * Default quote character (a double quotation mark).
   */
  public static final char QUOTE_CHAR = '\"';

  /**
   * A pattern catching most numbers that can be parsed using
   * {@link Double#parseDouble(String)}.
   *
   * Some examples: {@code 1}, {@code 1.}, {@code 1.2}, {@code .2},
   * {@code -.2e-03}, {@code 1e+03}.
   *
   * Note that both the mantissa and the exponent are optional in this pattern,
   * so it also matches the empty string or a bare exponent such as {@code e5}.
   * A match therefore does not guarantee that parsing will succeed.
   */
  public static final String NUMBER_PATTERN = "[+-]?(?:\\d+\\.?|\\d*\\.\\d+)?(?:[eE][-+]?\\d+)?";

  /**
   * OptionID for the column separator parameter (defaults to
   * {@link #DEFAULT_SEPARATOR}).
   */
  public static final OptionID COLUMN_SEPARATOR_ID = OptionID.getOrCreateOptionID("parser.colsep", "Column separator pattern. The default assumes whitespace separated data.");

  /**
   * OptionID for the quote character parameter (defaults to a double quotation
   * mark as in {@link #QUOTE_CHAR}).
   */
  public static final OptionID QUOTE_ID = OptionID.getOrCreateOptionID("parser.quote", "Quotation character. The default is to use a double quote.");

  /**
   * Stores the column separator pattern
   */
  private final Pattern colSep;

  /**
   * Stores the quotation character
   */
  protected char quoteChar = QUOTE_CHAR;

  /**
   * The comment character.
   */
  public static final String COMMENT = "#";

  /**
   * A sign to separate attributes.
   */
  public static final String ATTRIBUTE_CONCATENATION = " ";

  /**
   * Constructor.
   *
   * @param colSep Column separator
   * @param quoteChar Quote character
   */
  public AbstractParser(Pattern colSep, char quoteChar) {
    this.colSep = colSep;
    this.quoteChar = quoteChar;
  }

  /**
   * Tokenize a string. Works much like colSep.split() except it honors
   * quotation characters: a token that starts with the quote character extends
   * up to the next quote character that is directly followed by a separator
   * (or by the end of the input), and is returned without the surrounding
   * quotes. Separators inside such a quoted token do not split it.
   *
   * Empty tokens (including empty quoted tokens) are not reported. As the only
   * exception, an unquoted input that contains no separator at all is returned
   * as a single token, even if it is the empty string.
   *
   * If the last token starts with a quote character but has no closing quote,
   * a warning is logged and the remaining text is returned as is, including
   * the opening quote.
   *
   * @param input Input string
   * @return Tokenized string
   */
  protected List<String> tokenize(String input) {
    ArrayList<String> matchList = new ArrayList<String>();
    Matcher m = colSep.matcher(input);

    // Start offset of the token currently being scanned.
    int index = 0;
    boolean inquote = (input.length() > 0) && (input.charAt(0) == quoteChar);
    while(m.find()) {
      // Quoted code path vs. regular code path
      if(inquote && m.start() > 0) {
        // A separator only terminates a quoted token when it directly follows
        // a closing quote that is not the opening quote itself. Otherwise the
        // separator is part of the quoted text and is skipped.
        if(m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) {
          // Strip quote characters
          if(index + 1 < m.start() - 1) {
            matchList.add(input.substring(index + 1, m.start() - 1));
          }
          // Seek past
          index = m.end();
          // new quote?
          inquote = (index < input.length()) && (input.charAt(index) == quoteChar);
        }
      }
      else {
        // Add match before separator
        if(index < m.start()) {
          matchList.add(input.substring(index, m.start()));
        }
        // Seek past separator
        index = m.end();
        // new quote?
        inquote = (index < input.length()) && (input.charAt(index) == quoteChar);
      }
    }
    // No separator consumed and not quoted - return original string. A quoted
    // input must still go through the tail handling below, so that a line
    // consisting of a single quoted token gets its quotes stripped, too.
    if(index == 0 && !inquote) {
      matchList.add(input);
      return matchList;
    }
    // Add tail after last separator.
    final int last = input.length() - 1;
    if(inquote) {
      // The closing quote must be a different character position than the
      // opening quote at 'index'.
      if(last > index && input.charAt(last) == quoteChar) {
        if(index + 1 < last) {
          matchList.add(input.substring(index + 1, last));
        }
      }
      else {
        getLogger().warning("Invalid quoted line in input.");
        if(index < input.length()) {
          matchList.add(input.substring(index, input.length()));
        }
      }
    }
    else {
      if(index < input.length()) {
        matchList.add(input.substring(index, input.length()));
      }
    }
    // Return
    return matchList;
  }

  /**
   * Get the logger for this class.
   *
   * @return Logger.
   */
  protected abstract Logging getLogger();

  /**
   * Returns a string representation of the object, which is the name of the
   * runtime class.
   *
   * @return a string representation of the object.
   */
  @Override
  public String toString() {
    return getClass().getName();
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static abstract class Parameterizer extends AbstractParameterizer {
    /**
     * Stores the column separator pattern
     */
    protected Pattern colSep = null;

    /**
     * Stores the quotation character
     */
    protected char quoteChar = QUOTE_CHAR;

    /**
     * Reads the column separator pattern and the quote character from the
     * given parameterization. Each field is only updated when its parameter
     * could be grabbed from the configuration.
     *
     * @param config Parameterization to read the options from
     */
    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, DEFAULT_SEPARATOR);
      if(config.grab(colParam)) {
        colSep = colParam.getValue();
      }
      // The quote option must be exactly one character long.
      StringParameter quoteParam = new StringParameter(QUOTE_ID, new StringLengthConstraint(1, 1), String.valueOf(QUOTE_CHAR));
      if(config.grab(quoteParam)) {
        quoteChar = quoteParam.getValue().charAt(0);
      }
    }

    @Override
    protected abstract AbstractParser makeInstance();
  }
}