summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
diff options
context:
space:
mode:
author Erich Schubert <erich@debian.org> 2014-01-22 16:23:20 +0100
committer Andrej Shadura <andrewsh@debian.org> 2019-03-09 22:30:38 +0000
commit cd98487b2f040496b2dd35cfb3e01e745cd5fba2 (patch)
tree 78a9e71cbf436021b1e9601a44250511b37faa02 /src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
parent ec7f409f6e795bbcc6f3c005687954e9475c600c (diff)
parent 14a486343aef55f97f54082d6b542dedebf6f3ba (diff)
Import Debian changes 0.6.0-1
elki (0.6.0-1) unstable; urgency=low
  * New upstream final.
  * 3DPC extension is not included, but may be uploaded as a separate package when there is actual need (it is a demo software, not meant for use outside of research, so just get the source code!)
  * Upgrade to policy 3.9.5.0 (no changes)
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java 151
1 files changed, 27 insertions, 124 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
index 1e689638..e8201db1 100644
--- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
+++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java
@@ -23,15 +23,11 @@ package de.lmu.ifi.dbs.elki.datasource.parser;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.StringLengthConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter;
@@ -41,6 +37,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter;
* options.
*
* @author Arthur Zimek
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf Tokenizer
*/
public abstract class AbstractParser {
/**
@@ -51,7 +50,7 @@ public abstract class AbstractParser {
/**
* A quote pattern
*/
- public static final char QUOTE_CHAR = '\"';
+ public static final String QUOTE_CHARS = "\"'";
/**
* A pattern catching most numbers that can be parsed using
@@ -73,96 +72,38 @@ public abstract class AbstractParser {
public static final String ATTRIBUTE_CONCATENATION = " ";
/**
- * Stores the column separator pattern
- */
- private Pattern colSep = null;
-
- /**
- * Stores the quotation character
+ * Comment pattern.
*/
- protected char quoteChar = QUOTE_CHAR;
+ protected Pattern comment = null;
/**
- * Comment pattern.
+ * String tokenizer.
*/
- protected Pattern comment = null;
+ protected Tokenizer tokenizer;
/**
* Constructor.
*
* @param colSep Column separator
- * @param quoteChar Quote character
+ * @param quoteChars Quote characters
* @param comment Comment pattern
*/
- public AbstractParser(Pattern colSep, char quoteChar, Pattern comment) {
+ public AbstractParser(Pattern colSep, String quoteChars, Pattern comment) {
super();
- this.colSep = colSep;
- this.quoteChar = quoteChar;
+ this.tokenizer = new Tokenizer(colSep, quoteChars);
this.comment = comment;
}
- /**
- * Tokenize a string. Works much like colSep.split() except it honors
- * quotation characters.
- *
- * @param input Input string
- * @return Tokenized string
- */
- protected List<String> tokenize(String input) {
- ArrayList<String> matchList = new ArrayList<>();
- Matcher m = colSep.matcher(input);
-
- int index = 0;
- boolean inquote = (input.length() > 0) && (input.charAt(0) == quoteChar);
- while (m.find()) {
- // Quoted code path vs. regular code path
- if (inquote && m.start() > 0) {
- // Closing quote found?
- if (m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) {
- // Strip quote characters
- if (index + 1 < m.start() - 1) {
- matchList.add(input.substring(index + 1, m.start() - 1));
- }
- // Seek past
- index = m.end();
- // new quote?
- inquote = (index < input.length()) && (input.charAt(index) == quoteChar);
- }
- } else {
- // Add match before separator
- if (index < m.start()) {
- matchList.add(input.substring(index, m.start()));
- }
- // Seek past separator
- index = m.end();
- // new quote?
- inquote = (index < input.length()) && (input.charAt(index) == quoteChar);
+ public static int lengthWithoutLinefeed(String line) {
+ int length = line.length();
+ while(length > 0) {
+ char last = line.charAt(length - 1);
+ if(last != '\n' && last != '\r') {
+ break;
}
+ --length;
}
- // Nothing found - return original string.
- if (index == 0) {
- matchList.add(input);
- return matchList;
- }
- // Add tail after last separator.
- if (inquote) {
- if (input.charAt(input.length() - 1) == quoteChar) {
- if (index + 1 < input.length() - 1) {
- matchList.add(input.substring(index + 1, input.length() - 1));
- }
- } else {
- getLogger().warning("Invalid quoted line in input: no closing quote found in: " + input);
- if (index < input.length()) {
- matchList.add(input.substring(index, input.length()));
- }
- }
- } else {
- if (index < input.length()) {
- matchList.add(input.substring(index, input.length()));
- }
- }
- // Return
- return matchList;
+ return length;
}
/**
@@ -183,43 +124,6 @@ public abstract class AbstractParser {
}
/**
- * Utility function, which is a bit more robust wrt. parsing double values. In
- * particular: infinite values, and creates fewer objects.
- *
- * @param s String s
- * @return parsed value
- * @throws NumberFormatException
- */
- public static double parseDouble(String s) throws NumberFormatException {
- try {
- return Double.parseDouble(s);
- } catch (NumberFormatException e) {
- int len = s.length();
- if (len > 0) {
- int p = 0;
- char cur = s.charAt(p);
- boolean isNegative = cur == '-';
- if (isNegative && ++p < len) {
- cur = s.charAt(p);
- }
- if (cur == '∞') {
- return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
- }
- if (len - p == 3 && "Inf".regionMatches(true, 0, s, p, 3)) {
- return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
- }
- if (len - p == 8 && "Infinity".regionMatches(true, 0, s, p, 8)) {
- return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
- }
- if (len == 3 && "NaN".equalsIgnoreCase(s)) {
- return Double.NaN;
- }
- }
- throw e;
- }
- }
-
- /**
* Parameterization class.
*
* @author Erich Schubert
@@ -235,9 +139,9 @@ public abstract class AbstractParser {
/**
* OptionID for the quote character parameter (defaults to a double
- * quotation mark as in {@link #QUOTE_CHAR}.
+ * quotation mark as in {@link AbstractParser#QUOTE_CHARS}).
*/
- public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote.");
+ public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation characters. By default, both double and single ASCII quotes are accepted.");
/**
* Comment pattern.
@@ -252,7 +156,7 @@ public abstract class AbstractParser {
/**
* Stores the quotation character
*/
- protected char quoteChar = QUOTE_CHAR;
+ protected String quoteChars = QUOTE_CHARS;
/**
* Comment pattern.
@@ -263,16 +167,15 @@ public abstract class AbstractParser {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, DEFAULT_SEPARATOR);
- if (config.grab(colParam)) {
+ if(config.grab(colParam)) {
colSep = colParam.getValue();
}
- StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR));
- quoteParam.addConstraint(new StringLengthConstraint(1, 1));
- if (config.grab(quoteParam)) {
- quoteChar = quoteParam.getValue().charAt(0);
+ StringParameter quoteParam = new StringParameter(QUOTE_ID, QUOTE_CHARS);
+ if(config.grab(quoteParam)) {
+ quoteChars = quoteParam.getValue();
}
PatternParameter commentP = new PatternParameter(COMMENT_ID, COMMENT_PATTERN);
- if (config.grab(commentP)) {
+ if(config.grab(commentP)) {
comment = commentP.getValue();
}
}