diff options
author | Erich Schubert <erich@debian.org> | 2014-01-22 16:23:20 +0100 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:38 +0000 |
commit | cd98487b2f040496b2dd35cfb3e01e745cd5fba2 (patch) | |
tree | 78a9e71cbf436021b1e9601a44250511b37faa02 /src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java | |
parent | ec7f409f6e795bbcc6f3c005687954e9475c600c (diff) | |
parent | 14a486343aef55f97f54082d6b542dedebf6f3ba (diff) |
Import Debian changes 0.6.0-1
elki (0.6.0-1) unstable; urgency=low
* New upstream final release.
* 3DPC extension is not included, but may be uploaded as a separate
package when there is actual need (it is demo software, not meant
for use outside of research, so just get the source code!)
* Upgrade to policy 3.9.5.0 (no changes)
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java | 151 |
1 files changed, 27 insertions, 124 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java index 1e689638..e8201db1 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/parser/AbstractParser.java @@ -23,15 +23,11 @@ package de.lmu.ifi.dbs.elki.datasource.parser; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; import java.util.regex.Pattern; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.StringLengthConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter; @@ -41,6 +37,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.StringParameter; * options. * * @author Arthur Zimek + * @author Erich Schubert + * + * @apiviz.composedOf Tokenizer */ public abstract class AbstractParser { /** @@ -51,7 +50,7 @@ public abstract class AbstractParser { /** * A quote pattern */ - public static final char QUOTE_CHAR = '\"'; + public static final String QUOTE_CHARS = "\"'"; /** * A pattern catching most numbers that can be parsed using @@ -73,96 +72,38 @@ public abstract class AbstractParser { public static final String ATTRIBUTE_CONCATENATION = " "; /** - * Stores the column separator pattern - */ - private Pattern colSep = null; - - /** - * Stores the quotation character + * Comment pattern. */ - protected char quoteChar = QUOTE_CHAR; + protected Pattern comment = null; /** - * Comment pattern. + * String tokenizer. 
*/ - protected Pattern comment = null; + protected Tokenizer tokenizer; /** * Constructor. * * @param colSep Column separator - * @param quoteChar Quote character + * @param quoteChars Quote character * @param comment Comment pattern */ - public AbstractParser(Pattern colSep, char quoteChar, Pattern comment) { + public AbstractParser(Pattern colSep, String quoteChars, Pattern comment) { super(); - this.colSep = colSep; - this.quoteChar = quoteChar; + this.tokenizer = new Tokenizer(colSep, quoteChars); this.comment = comment; } - /** - * Tokenize a string. Works much like colSep.split() except it honors - * quotation characters. - * - * @param input Input string - * @return Tokenized string - */ - protected List<String> tokenize(String input) { - ArrayList<String> matchList = new ArrayList<>(); - Matcher m = colSep.matcher(input); - - int index = 0; - boolean inquote = (input.length() > 0) && (input.charAt(0) == quoteChar); - while (m.find()) { - // Quoted code path vs. regular code path - if (inquote && m.start() > 0) { - // Closing quote found? - if (m.start() > index + 1 && input.charAt(m.start() - 1) == quoteChar) { - // Strip quote characters - if (index + 1 < m.start() - 1) { - matchList.add(input.substring(index + 1, m.start() - 1)); - } - // Seek past - index = m.end(); - // new quote? - inquote = (index < input.length()) && (input.charAt(index) == quoteChar); - } - } else { - // Add match before separator - if (index < m.start()) { - matchList.add(input.substring(index, m.start())); - } - // Seek past separator - index = m.end(); - // new quote? - inquote = (index < input.length()) && (input.charAt(index) == quoteChar); + public static int lengthWithoutLinefeed(String line) { + int length = line.length(); + while(length > 0) { + char last = line.charAt(length - 1); + if(last != '\n' && last != '\r') { + break; } + --length; } - // Nothing found - return original string. 
- if (index == 0) { - matchList.add(input); - return matchList; - } - // Add tail after last separator. - if (inquote) { - if (input.charAt(input.length() - 1) == quoteChar) { - if (index + 1 < input.length() - 1) { - matchList.add(input.substring(index + 1, input.length() - 1)); - } - } else { - getLogger().warning("Invalid quoted line in input: no closing quote found in: " + input); - if (index < input.length()) { - matchList.add(input.substring(index, input.length())); - } - } - } else { - if (index < input.length()) { - matchList.add(input.substring(index, input.length())); - } - } - // Return - return matchList; + return length; } /** @@ -183,43 +124,6 @@ public abstract class AbstractParser { } /** - * Utility function, which is a bit more robust wrt. parsing double values. In - * particular: infinite values, and creates fewer objects. - * - * @param s String s - * @return parsed value - * @throws NumberFormatException - */ - public static double parseDouble(String s) throws NumberFormatException { - try { - return Double.parseDouble(s); - } catch (NumberFormatException e) { - int len = s.length(); - if (len > 0) { - int p = 0; - char cur = s.charAt(p); - boolean isNegative = cur == '-'; - if (isNegative && ++p < len) { - cur = s.charAt(p); - } - if (cur == '∞') { - return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; - } - if (len - p == 3 && "Inf".regionMatches(true, 0, s, p, 3)) { - return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; - } - if (len - p == 8 && "Infinity".regionMatches(true, 0, s, p, 8)) { - return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; - } - if (len == 3 && "NaN".equalsIgnoreCase(s)) { - return Double.NaN; - } - } - throw e; - } - } - - /** * Parameterization class. * * @author Erich Schubert @@ -235,9 +139,9 @@ public abstract class AbstractParser { /** * OptionID for the quote character parameter (defaults to a double - * quotation mark as in {@link #QUOTE_CHAR}. 
+ * quotation mark as in {@link AbstractParser#QUOTE_CHARS}. */ - public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation character. The default is to use a double quote."); + public static final OptionID QUOTE_ID = new OptionID("parser.quote", "Quotation characters. By default, both double and single ASCII quotes are accepted."); /** * Comment pattern. @@ -252,7 +156,7 @@ public abstract class AbstractParser { /** * Stores the quotation character */ - protected char quoteChar = QUOTE_CHAR; + protected String quoteChars = QUOTE_CHARS; /** * Comment pattern. @@ -263,16 +167,15 @@ public abstract class AbstractParser { protected void makeOptions(Parameterization config) { super.makeOptions(config); PatternParameter colParam = new PatternParameter(COLUMN_SEPARATOR_ID, DEFAULT_SEPARATOR); - if (config.grab(colParam)) { + if(config.grab(colParam)) { colSep = colParam.getValue(); } - StringParameter quoteParam = new StringParameter(QUOTE_ID, String.valueOf(QUOTE_CHAR)); - quoteParam.addConstraint(new StringLengthConstraint(1, 1)); - if (config.grab(quoteParam)) { - quoteChar = quoteParam.getValue().charAt(0); + StringParameter quoteParam = new StringParameter(QUOTE_ID, QUOTE_CHARS); + if(config.grab(quoteParam)) { + quoteChars = quoteParam.getValue(); } PatternParameter commentP = new PatternParameter(COMMENT_ID, COMMENT_PATTERN); - if (config.grab(commentP)) { + if(config.grab(commentP)) { comment = commentP.getValue(); } } |