diff options
author | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:41 +0000 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:41 +0000 |
commit | 38212b3127e90751fb39cda34250bc11be62b76c (patch) | |
tree | dc1397346030e9695bd763dddc93b3be527cd643 /elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java | |
parent | 337087b668d3a54f3afee3a9adb597a32e9f7e94 (diff) |
Import Upstream version 0.7.0
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java | 181 |
1 files changed, 181 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java new file mode 100644 index 00000000..ce639606 --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/datasource/ConcatenateFilesDatabaseConnection.java @@ -0,0 +1,181 @@ +package de.lmu.ifi.dbs.elki.datasource; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta; +import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource; +import de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource.Event; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.datasource.filter.ObjectFilter; +import de.lmu.ifi.dbs.elki.datasource.parser.NumberVectorLabelParser; +import de.lmu.ifi.dbs.elki.datasource.parser.Parser; +import de.lmu.ifi.dbs.elki.datasource.parser.StreamingParser; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.FileUtil; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileListParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileListParameter.FilesType; + +/** + * Database that will loading multiple files, concatenating the results. + * + * @author Erich Schubert + */ +public class ConcatenateFilesDatabaseConnection extends AbstractDatabaseConnection { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(ConcatenateFilesDatabaseConnection.class); + + /** + * Input file list. + */ + private List<File> files; + + /** + * The parser. + */ + private Parser parser; + + /** + * Constructor. + * + * @param files Input files + * @param parser Parser + * @param filters Filters + */ + public ConcatenateFilesDatabaseConnection(List<File> files, Parser parser, List<ObjectFilter> filters) { + super(filters); + this.files = files; + this.parser = parser; + } + + @Override + public MultipleObjectsBundle loadData() { + MultipleObjectsBundle objects = new MultipleObjectsBundle(); + objects.appendColumn(TypeUtil.STRING, new ArrayList<>()); + for(File file : files) { + String filestr = file.getPath(); + try { + InputStream inputStream = new BufferedInputStream(new FileInputStream(file)); + inputStream = FileUtil.tryGzipInput(inputStream); + + final BundleStreamSource source; + if(parser instanceof StreamingParser) { + final StreamingParser streamParser = (StreamingParser) parser; + streamParser.initStream(inputStream); + source = streamParser; + } + else { + MultipleObjectsBundle parsingResult = parser.parse(inputStream); + // normalize objects and transform labels + source = parsingResult.asStream(); + } + BundleMeta meta = null; // NullPointerException on invalid streams + loop: for(Event e = source.nextEvent();; e = source.nextEvent()) { + switch(e){ + case END_OF_STREAM: + break loop; + case META_CHANGED: + meta = source.getMeta(); + for(int i = 0; i < meta.size(); i++) { + if(i + 1 >= objects.metaLength()) { + objects.appendColumn(meta.get(i), new ArrayList<>()); + } + else { + // Ensure compatibility: + if(!objects.meta(i + 1).isAssignableFromType(meta.get(i))) { + throw new AbortException("Incompatible files loaded. Cannot concatenate with unaligned columns, please preprocess manually."); + } + } + } + break; // switch + case NEXT_OBJECT: + Object[] o = new Object[objects.metaLength()]; + o[0] = filestr; + for(int i = 0; i < meta.size(); i++) { + o[i + 1] = source.data(i); + } + objects.appendSimple(o); + break; // switch + } + } + } + catch(IOException e) { + throw new AbortException("Loading file " + filestr + " failed: " + e.toString(), e); + } + } + parser.cleanup(); + // Invoke filters + if(LOG.isDebugging()) { + LOG.debugFine("Invoking filters."); + } + return invokeBundleFilters(objects); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractDatabaseConnection.Parameterizer { + /** + * The input files. + */ + private List<File> files; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + FileListParameter filesP = new FileListParameter(FileBasedDatabaseConnection.Parameterizer.INPUT_ID, FilesType.INPUT_FILES); + if(config.grab(filesP)) { + files = filesP.getValue(); + } + configFilters(config); + configParser(config, Parser.class, NumberVectorLabelParser.class); + } + + @Override + protected ConcatenateFilesDatabaseConnection makeInstance() { + return new ConcatenateFilesDatabaseConnection(files, parser, filters); + } + } +} |