diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/application')
23 files changed, 1570 insertions, 145 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/application/AbstractApplication.java b/src/de/lmu/ifi/dbs/elki/application/AbstractApplication.java index c2f19ef5..2775c928 100644 --- a/src/de/lmu/ifi/dbs/elki/application/AbstractApplication.java +++ b/src/de/lmu/ifi/dbs/elki/application/AbstractApplication.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -75,7 +75,7 @@ public abstract class AbstractApplication implements Parameterizable { /** * Information for citation and version. */ - public static final String INFORMATION = "ELKI Version 0.4 (2011, August)" + NEWLINE + NEWLINE + "published in:" + NEWLINE + "E. Achtert, A. Hettab, H.-P. Kriegel, E. Schubert, A. Zimek:" + NEWLINE + "Spatial Outlier Detection: Data, Algorithms, Visualizations." + NEWLINE + "In Proceedings of the 12th International Symposium on" + NEWLINE + "Spatial and Temporal Databases (SSTD), Minneapolis, MN, 2011." + NEWLINE; + public static final String INFORMATION = "ELKI Version 0.5.0~beta1 (2012, April)" + NEWLINE + NEWLINE + "published in:" + NEWLINE + "E. Achtert, S. Goldhofer, H.-P. Kriegel, E. Schubert, A. Zimek:" + NEWLINE + "Evaluation of Clusterings – Metrics and Visual Support." + NEWLINE + "In Proceedings of the 28th"+NEWLINE+"International Conference on Data Engineering (ICDE), Washington, DC, 2012." + NEWLINE; /** * Parameter that specifies the name of the output file. 
diff --git a/src/de/lmu/ifi/dbs/elki/application/ComputeSingleColorHistogram.java b/src/de/lmu/ifi/dbs/elki/application/ComputeSingleColorHistogram.java index 63fc5d63..e6fe5231 100644 --- a/src/de/lmu/ifi/dbs/elki/application/ComputeSingleColorHistogram.java +++ b/src/de/lmu/ifi/dbs/elki/application/ComputeSingleColorHistogram.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -58,8 +58,15 @@ public class ComputeSingleColorHistogram extends AbstractApplication { * Key: {@code -colorhist.in} * </p> */ + public static final OptionID INPUT_ID = OptionID.getOrCreateOptionID("colorhist.in", "Input image file for color histogram."); - public static final OptionID INPUT_ID = OptionID.getOrCreateOptionID("colorhist.in", "Input image for color histograms."); + /** + * Parameter that specifies the name of the mask input file. + * <p> + * Key: {@code -colorhist.mask} + * </p> + */ + public static final OptionID MASK_ID = OptionID.getOrCreateOptionID("colorhist.mask", "Input mask image file."); /** * Class that will compute the actual histogram @@ -72,23 +79,30 @@ public class ComputeSingleColorHistogram extends AbstractApplication { private File inputFile; /** + * Mask file. + */ + private File maskFile; + + /** * Constructor. 
* * @param verbose Verbose flag * @param histogrammaker Class to compute histograms with * @param inputFile Input file + * @param maskFile Mask file */ - public ComputeSingleColorHistogram(boolean verbose, ComputeColorHistogram histogrammaker, File inputFile) { + public ComputeSingleColorHistogram(boolean verbose, ComputeColorHistogram histogrammaker, File inputFile, File maskFile) { super(verbose); this.histogrammaker = histogrammaker; this.inputFile = inputFile; + this.maskFile = maskFile; } @Override public void run() throws UnableToComplyException { double[] hist; try { - hist = histogrammaker.computeColorHistogram(inputFile); + hist = histogrammaker.computeColorHistogram(inputFile, maskFile); } catch(IOException e) { throw new UnableToComplyException(e); @@ -114,6 +128,11 @@ public class ComputeSingleColorHistogram extends AbstractApplication { */ private File inputFile; + /** + * Mask file. + */ + private File maskFile; + @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); @@ -125,11 +144,15 @@ public class ComputeSingleColorHistogram extends AbstractApplication { if(config.grab(inputP)) { inputFile = inputP.getValue(); } + final FileParameter maskP = new FileParameter(MASK_ID, FileParameter.FileType.INPUT_FILE, true); + if(config.grab(maskP)) { + maskFile = maskP.getValue(); + } } @Override protected ComputeSingleColorHistogram makeInstance() { - return new ComputeSingleColorHistogram(verbose, histogrammaker, inputFile); + return new ComputeSingleColorHistogram(verbose, histogrammaker, inputFile, maskFile); } } diff --git a/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java b/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java index 373960a5..ccee6d39 100644 --- a/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java +++ b/src/de/lmu/ifi/dbs/elki/application/GeneratorXMLSpec.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application; This file is part of ELKI: Environment for Developing 
KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,22 +23,30 @@ package de.lmu.ifi.dbs.elki.application; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.iterator.TIntIterator; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; + import java.io.File; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; -import java.util.List; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; -import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface; -import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterfaceDynamic; +import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorSingleCluster; -import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.distribution.Distribution; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.datasource.GeneratorXMLDatabaseConnection; import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -124,27 +132,52 @@ public class GeneratorXMLSpec extends AbstractApplication { * @throws IOException thrown on write errors */ public void writeClusters(OutputStreamWriter outStream, MultipleObjectsBundle data) throws IOException { - List<GeneratorInterface> clusters = 
generator.gen.getGenerators(); + int modelcol = -1; + { // Find model column + for(int i = 0; i < data.metaLength(); i++) { + if(TypeUtil.MODEL.isAssignableFromType(data.meta(i))) { + modelcol = i; + break; + } + } + } + if(modelcol < 0) { + throw new AbortException("No model column found in bundle."); + } + ArrayList<Model> models = new ArrayList<Model>(); + Map<Model, TIntList> modelMap = new HashMap<Model, TIntList>(); + { // Build a map from model to the actual objects + for(int i = 0; i < data.dataLength(); i++) { + Model model = (Model) data.data(i, modelcol); + TIntList modelids = modelMap.get(model); + if(modelids == null) { + models.add(model); + modelids = new TIntArrayList(); + modelMap.put(model, modelids); + } + modelids.add(i); + } + } // compute global discard values int totalsize = 0; int totaldisc = 0; - assert (clusters.size() > 0); - for(GeneratorInterface curclus : clusters) { - totalsize = totalsize + curclus.getSize(); - if(curclus instanceof GeneratorSingleCluster) { - totaldisc = totaldisc + ((GeneratorSingleCluster) curclus).getDiscarded(); + for(Entry<Model, TIntList> ent : modelMap.entrySet()) { + totalsize = totalsize + ent.getValue().size(); + if(ent.getKey() instanceof GeneratorSingleCluster) { + totaldisc = totaldisc + ((GeneratorSingleCluster) ent.getKey()).getDiscarded(); } } double globdens = (double) (totalsize + totaldisc) / totalsize; outStream.write("########################################################" + LINE_SEPARATOR); - outStream.write("## Number of clusters: " + clusters.size() + LINE_SEPARATOR); - for(GeneratorInterface curclus : clusters) { + outStream.write("## Number of clusters: " + models.size() + LINE_SEPARATOR); + for(Model model : models) { + TIntList ids = modelMap.get(model); outStream.write("########################################################" + LINE_SEPARATOR); - outStream.write("## Cluster: " + curclus.getName() + LINE_SEPARATOR); - 
outStream.write("########################################################" + LINE_SEPARATOR); - outStream.write("## Size: " + curclus.getSize() + LINE_SEPARATOR); - if(curclus instanceof GeneratorSingleCluster) { - GeneratorSingleCluster cursclus = (GeneratorSingleCluster) curclus; + outStream.write("## Size: " + ids.size() + LINE_SEPARATOR); + if(model instanceof GeneratorSingleCluster) { + GeneratorSingleCluster cursclus = (GeneratorSingleCluster) model; + outStream.write("########################################################" + LINE_SEPARATOR); + outStream.write("## Cluster: " + cursclus.getName() + LINE_SEPARATOR); Vector cmin = cursclus.getClipmin(); Vector cmax = cursclus.getClipmax(); if(cmin != null && cmax != null) { @@ -152,27 +185,30 @@ public class GeneratorXMLSpec extends AbstractApplication { } outStream.write("## Density correction factor: " + cursclus.getDensityCorrection() + LINE_SEPARATOR); outStream.write("## Generators:" + LINE_SEPARATOR); - for(Distribution gen : cursclus.getAxes()) { + for(int i = 0; i < cursclus.getDim(); i++) { + Distribution gen = cursclus.getDistribution(i); outStream.write("## " + gen.toString() + LINE_SEPARATOR); } - if(cursclus.getTrans() != null && cursclus.getTrans().getTransformation() != null) { + if(cursclus.getTransformation() != null && cursclus.getTransformation().getTransformation() != null) { outStream.write("## Affine transformation matrix:" + LINE_SEPARATOR); - outStream.write(FormatUtil.format(cursclus.getTrans().getTransformation(), "## ") + LINE_SEPARATOR); + outStream.write(FormatUtil.format(cursclus.getTransformation().getTransformation(), "## ") + LINE_SEPARATOR); } - } - if(curclus instanceof GeneratorInterfaceDynamic) { - GeneratorSingleCluster cursclus = (GeneratorSingleCluster) curclus; outStream.write("## Discards: " + cursclus.getDiscarded() + " Retries left: " + cursclus.getRetries() + LINE_SEPARATOR); double corf = /* cursclus.overweight */(double) (cursclus.getSize() + 
cursclus.getDiscarded()) / cursclus.getSize() / globdens; outStream.write("## Density correction factor estimation: " + corf + LINE_SEPARATOR); } outStream.write("########################################################" + LINE_SEPARATOR); - for(Vector p : curclus.getPoints()) { - for(int i = 0; i < p.getRowDimensionality(); i++) { - outStream.write(p.get(i) + " "); + for(TIntIterator iter = ids.iterator(); iter.hasNext();) { + int num = iter.next(); + for(int c = 0; c < data.metaLength(); c++) { + if(c != modelcol) { + if (c > 0) { + outStream.write(" "); + } + outStream.write(data.data(num, c).toString()); + } } - outStream.write(curclus.getName()); outStream.write(LINE_SEPARATOR); } } diff --git a/src/de/lmu/ifi/dbs/elki/application/KDDCLIApplication.java b/src/de/lmu/ifi/dbs/elki/application/KDDCLIApplication.java index 59f75a7b..9184df58 100644 --- a/src/de/lmu/ifi/dbs/elki/application/KDDCLIApplication.java +++ b/src/de/lmu/ifi/dbs/elki/application/KDDCLIApplication.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java b/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java index 62080b51..c2e575be 100644 --- a/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java +++ b/src/de/lmu/ifi/dbs/elki/application/cache/CacheDoubleDistanceInOnDiskMatrix.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.cache; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development 
Team @@ -119,7 +119,7 @@ public class CacheDoubleDistanceInOnDiskMatrix<O, D extends NumberDistance<D, ?> DistanceQuery<O, D> distanceQuery = database.getDistanceQuery(relation, distance); int matrixsize = 0; - for(DBID id : distanceQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { matrixsize = Math.max(matrixsize, id.getIntegerID() + 1); if(id.getIntegerID() < 0) { throw new AbortException("OnDiskMatrixCache does not allow negative DBIDs."); @@ -134,8 +134,8 @@ public class CacheDoubleDistanceInOnDiskMatrix<O, D extends NumberDistance<D, ?> throw new AbortException("Error creating output matrix.", e); } - for(DBID id1 : distanceQuery.getRelation().iterDBIDs()) { - for(DBID id2 : distanceQuery.getRelation().iterDBIDs()) { + for(DBID id1 : relation.iterDBIDs()) { + for(DBID id2 : relation.iterDBIDs()) { if(id2.getIntegerID() >= id1.getIntegerID()) { double d = distanceQuery.distance(id1, id2).doubleValue(); if(debugExtraCheckSymmetry) { diff --git a/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java b/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java index 7c99779d..237aedb9 100644 --- a/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java +++ b/src/de/lmu/ifi/dbs/elki/application/cache/CacheFloatDistanceInOnDiskMatrix.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.cache; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -124,7 +124,7 @@ public class CacheFloatDistanceInOnDiskMatrix<O, D extends NumberDistance<D, ?>> DistanceQuery<O, D> distanceQuery = database.getDistanceQuery(relation, distance); int matrixsize = 0; - for(DBID id : distanceQuery.getRelation().iterDBIDs()) { + for(DBID id : relation.iterDBIDs()) { matrixsize = 
Math.max(matrixsize, id.getIntegerID() + 1); if(id.getIntegerID() < 0) { throw new AbortException("OnDiskMatrixCache does not allow negative DBIDs."); @@ -139,8 +139,8 @@ public class CacheFloatDistanceInOnDiskMatrix<O, D extends NumberDistance<D, ?>> throw new AbortException("Error creating output matrix.", e); } - for(DBID id1 : distanceQuery.getRelation().iterDBIDs()) { - for(DBID id2 : distanceQuery.getRelation().iterDBIDs()) { + for(DBID id1 : relation.iterDBIDs()) { + for(DBID id2 : relation.iterDBIDs()) { if(id2.getIntegerID() >= id1.getIntegerID()) { float d = distanceQuery.distance(id1, id2).floatValue(); if(debugExtraCheckSymmetry) { diff --git a/src/de/lmu/ifi/dbs/elki/application/cache/package-info.java b/src/de/lmu/ifi/dbs/elki/application/cache/package-info.java index 17013c55..5ed7ec1c 100644 --- a/src/de/lmu/ifi/dbs/elki/application/cache/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/application/cache/package-info.java @@ -11,7 +11,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java new file mode 100644 index 00000000..f6506fcc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java @@ -0,0 +1,468 @@ +package de.lmu.ifi.dbs.elki.application.greedyensemble; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public 
License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.ABOD; +import de.lmu.ifi.dbs.elki.algorithm.outlier.KNNOutlier; +import de.lmu.ifi.dbs.elki.algorithm.outlier.KNNWeightOutlier; +import de.lmu.ifi.dbs.elki.algorithm.outlier.LDOF; +import de.lmu.ifi.dbs.elki.algorithm.outlier.LOF; +import de.lmu.ifi.dbs.elki.algorithm.outlier.LoOP; +import de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.ByLabelOutlier; +import de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialAllOutlier; +import de.lmu.ifi.dbs.elki.algorithm.outlier.trivial.TrivialNoOutlier; +import de.lmu.ifi.dbs.elki.application.AbstractApplication; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import 
de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.PolynomialKernelFunction; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultHierarchy; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.utilities.Base64; +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.scaling.IdentityScaling; +import de.lmu.ifi.dbs.elki.utilities.scaling.ScalingFunction; +import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.MinusLogStandardDeviationScaling; +import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.StandardDeviationScaling; +import de.lmu.ifi.dbs.elki.workflow.InputStep; + +/** + * Application that runs a series of kNN-based algorithms on a data set, for + * building an ensemble in a second step. The output file consists of a label + * and one score value for each object. + * + * Reference: + * <p> + * E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br /> + * On Evaluation of Outlier Rankings and Outlier Scores<br/> + * In Proceedings of the 12th SIAM International Conference on Data Mining + * (SDM), Anaheim, CA, 2012. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "E. Schubert, R. 
Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.") +public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends AbstractApplication { + /** + * Our logger class. + */ + private static final Logging logger = Logging.getLogger(ComputeKNNOutlierScores.class); + + /** + * Input step + */ + final InputStep inputstep; + + /** + * Distance function to use + */ + final DistanceFunction<? super O, D> distf; + + /** + * Starting value of k. + */ + final int startk; + + /** + * k step size + */ + final int stepk; + + /** + * Maximum value of k + */ + final int maxk; + + /** + * Output file + */ + File outfile; + + /** + * By label outlier detection - reference + */ + ByLabelOutlier bylabel; + + /** + * Include ABOD in the experiments. + */ + boolean runabod = false; + + /** + * Constructor. + * + * @param verbose Verbose flag + * @param inputstep Input step + * @param distf Distance function + * @param startk Starting value of k + * @param stepk K step size + * @param maxk Maximum k value + * @param bylabel By label outlier (reference) + * @param outfile Output file + */ + public ComputeKNNOutlierScores(boolean verbose, InputStep inputstep, DistanceFunction<? 
super O, D> distf, int startk, int stepk, int maxk, ByLabelOutlier bylabel, File outfile) { + super(verbose); + this.distf = distf; + this.startk = startk; + this.stepk = stepk; + this.maxk = maxk; + this.inputstep = inputstep; + this.bylabel = bylabel; + this.outfile = outfile; + } + + @Override + public void run() { + final Database database = inputstep.getDatabase(); + final Relation<O> relation = database.getRelation(distf.getInputTypeRestriction()); + logger.verbose("Running preprocessor ..."); + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<O, D>(relation, distf, maxk + 2); + database.addIndex(preproc); + + // Test that we did get a proper index query + KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, distf); + if(!(knnq instanceof PreprocessorKNNQuery)) { + logger.warning("Not using preprocessor knn query -- KNN queries using class: " + knnq.getClass()); + } + + final DBIDs ids = relation.getDBIDs(); + + final PrintStream fout; + try { + fout = new PrintStream(outfile); + } + catch(FileNotFoundException e) { + throw new AbortException("Cannot create output file.", e); + } + // Control: print the DBIDs in case we are seeing an odd iteration + { + try { + MessageDigest md = MessageDigest.getInstance("MD5"); + for(DBID id : ids) { + md.update(" ".getBytes()); + md.update(id.toString().getBytes()); + } + fout.append("# DBID-series MD5:"); + fout.append(Base64.encodeBase64(md.digest())); + fout.append(FormatUtil.NEWLINE); + } + catch(NoSuchAlgorithmException e) { + throw new AbortException("MD5 not found."); + } + } + + // Label outlier result (reference) + { + OutlierResult bylabelresult = bylabel.run(database); + writeResult(fout, ids, bylabelresult, new IdentityScaling(), "bylabel"); + } + // No/all outliers "results" + { + OutlierResult noresult = (new TrivialNoOutlier()).run(database); + writeResult(fout, ids, noresult, new IdentityScaling(), "no-outliers"); + OutlierResult allresult = (new TrivialAllOutlier()).run(database); 
+ writeResult(fout, ids, allresult, new IdentityScaling(), "all-outliers"); + } + + // KNN + logger.verbose("Running KNN"); + runForEachK(new AlgRunner() { + @Override + public void run(int k, String kstr) { + KNNOutlier<O, D> knn = new KNNOutlier<O, D>(distf, k); + OutlierResult knnresult = knn.run(database, relation); + // Setup scaling + StandardDeviationScaling scaling = new StandardDeviationScaling(); + scaling.prepare(knnresult); + writeResult(fout, ids, knnresult, scaling, "KNN-" + kstr); + detachResult(database, knnresult); + } + }); + // KNN Weight + logger.verbose("Running KNNweight"); + runForEachK(new AlgRunner() { + @Override + public void run(int k, String kstr) { + KNNWeightOutlier<O, D> knnw = new KNNWeightOutlier<O, D>(distf, k); + OutlierResult knnresult = knnw.run(database, relation); + // Setup scaling + StandardDeviationScaling scaling = new StandardDeviationScaling(); + scaling.prepare(knnresult); + writeResult(fout, ids, knnresult, scaling, "KNNW-" + kstr); + detachResult(database, knnresult); + } + }); + // Run LOF + logger.verbose("Running LOF"); + runForEachK(new AlgRunner() { + @Override + public void run(int k, String kstr) { + LOF<O, D> lof = new LOF<O, D>(k, distf, distf); + OutlierResult lofresult = lof.run(relation); + // Setup scaling + StandardDeviationScaling scaling = new StandardDeviationScaling(1.0, 1.0); + scaling.prepare(lofresult); + writeResult(fout, ids, lofresult, scaling, "LOF-" + kstr); + detachResult(database, lofresult); + } + }); + // LoOP + logger.verbose("Running LoOP"); + runForEachK(new AlgRunner() { + @Override + public void run(int k, String kstr) { + LoOP<O, D> loop = new LoOP<O, D>(k, k, distf, distf, 1.0); + OutlierResult loopresult = loop.run(database, relation); + writeResult(fout, ids, loopresult, new IdentityScaling(), "LOOP-" + kstr); + detachResult(database, loopresult); + } + }); + // LDOF + logger.verbose("Running LDOF"); + runForEachK(new AlgRunner() { + @Override + public void run(int k, String 
kstr) { + LDOF<O, D> ldof = new LDOF<O, D>(distf, k); + OutlierResult ldofresult = ldof.run(database, relation); + // Setup scaling + StandardDeviationScaling scaling = new StandardDeviationScaling(1.0, 1.0); + scaling.prepare(ldofresult); + writeResult(fout, ids, ldofresult, scaling, "LDOF-" + kstr); + detachResult(database, ldofresult); + } + }); + // ABOD + if(runabod && relation.size() < 10000) { + try { + final PolynomialKernelFunction poly = new PolynomialKernelFunction(PolynomialKernelFunction.DEFAULT_DEGREE); + @SuppressWarnings("unchecked") + final DistanceFunction<DoubleVector, DoubleDistance> df = DistanceFunction.class.cast(distf); + logger.verbose("Running ABOD"); + runForEachK(new AlgRunner() { + @Override + public void run(int k, String kstr) { + ABOD<DoubleVector> abod = new ABOD<DoubleVector>(k, poly, df); + OutlierResult abodresult = abod.run(database); + // Setup scaling + StandardDeviationScaling scaling = new MinusLogStandardDeviationScaling(null, 1.0); + scaling.prepare(abodresult); + writeResult(fout, ids, abodresult, scaling, "ABOD-" + kstr); + detachResult(database, abodresult); + } + }); + } + catch(ClassCastException e) { + // ABOD might just be not appropriate. + logger.warning("Running ABOD failed - probably not appropriate to this data type / distance?", e); + } + } + } + + /** + * Avoid that (future changes?) keep a reference to the result. + * + * @param database Database + * @param discardresult Result to discard. + */ + void detachResult(Database database, OutlierResult discardresult) { + final ResultHierarchy hier = database.getHierarchy(); + for(Result parent : hier.getParents(discardresult)) { + hier.remove(parent, discardresult); + } + } + + /** + * Write a single output line. + * + * @param out Output stream + * @param ids DBIDs + * @param result Outlier result + * @param scaling Scaling function. 
+ * @param label Identification label + */ + void writeResult(PrintStream out, DBIDs ids, OutlierResult result, ScalingFunction scaling, String label) { + out.append(label); + Relation<Double> scores = result.getScores(); + for(DBID id : ids) { + final double value = scaling.getScaled(scores.get(id)); + out.append(" ").append(FormatUtil.format(value, FormatUtil.NF8)); + } + out.append(FormatUtil.NEWLINE); + } + + /** + * Iterate over the k range. + * + * @param runner Runner to run + */ + private void runForEachK(AlgRunner runner) { + final int digits = (int) Math.ceil(Math.log10(maxk)); + final int startk = (this.startk > 0) ? this.startk : this.stepk; + for(int k = startk; k <= maxk; k += stepk) { + String kstr = String.format("%0" + digits + "d", k); + runner.run(k, kstr); + } + } + + /** + * Run an algorithm for a given k. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private interface AlgRunner { + public void run(int k, String kstr); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractApplication.Parameterizer { + /** + * Option ID for k step size. + */ + public static final OptionID STEPK_ID = OptionID.getOrCreateOptionID("stepk", "Step size for k."); + + /** + * Option ID for k start size. + */ + public static final OptionID STARTK_ID = OptionID.getOrCreateOptionID("startk", "Minimum value for k."); + + /** + * Option ID for k step size. + */ + public static final OptionID MAXK_ID = OptionID.getOrCreateOptionID("maxk", "Maximum value for k."); + + /** + * k step size + */ + int stepk; + + /** + * starting value of k + */ + int startk; + + /** + * Maximum value of k + */ + int maxk; + + /** + * Data source + */ + InputStep inputstep; + + /** + * Distance function to use + */ + DistanceFunction<? 
super O, D> distf; + + /** + * By label outlier -- reference + */ + ByLabelOutlier bylabel; + + /** + * Output destination file + */ + File outfile; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Data input + inputstep = config.tryInstantiate(InputStep.class); + // Distance function + ObjectParameter<DistanceFunction<? super O, D>> distP = AbstractAlgorithm.makeParameterDistanceFunction(EuclideanDistanceFunction.class, DistanceFunction.class); + if(config.grab(distP)) { + distf = distP.instantiateClass(config); + } + // k parameters + IntParameter stepkP = new IntParameter(STEPK_ID, new GreaterConstraint(0)); + if(config.grab(stepkP)) { + stepk = stepkP.getValue(); + } + IntParameter startkP = new IntParameter(STARTK_ID, true); + if(config.grab(startkP)) { + startk = startkP.getValue(); + } + else { + startk = stepk; + } + IntParameter maxkP = new IntParameter(MAXK_ID, new GreaterConstraint(0)); + if(config.grab(maxkP)) { + maxk = maxkP.getValue(); + } + bylabel = config.tryInstantiate(ByLabelOutlier.class); + // Output + outfile = super.getParameterOutputFile(config, "File to output the resulting score vectors to."); + } + + @Override + protected AbstractApplication makeInstance() { + return new ComputeKNNOutlierScores<O, D>(verbose, inputstep, distf, startk, stepk, maxk, bylabel, outfile); + } + } + + /** + * Main method. + * + * @param args Command line parameters. + */ + public static void main(String[] args) { + ComputeKNNOutlierScores.runCLIApplication(ComputeKNNOutlierScores.class, args); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java new file mode 100644 index 00000000..cfd768c1 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java @@ -0,0 +1,439 @@ +package de.lmu.ifi.dbs.elki.application.greedyensemble; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.Arrays; +import java.util.Collections; +import java.util.Set; +import java.util.TreeSet; + +import de.lmu.ifi.dbs.elki.application.AbstractApplication; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.WeightedPearsonCorrelationDistanceFunction; +import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TopBoundedHeap; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; +import de.lmu.ifi.dbs.elki.workflow.InputStep; + +/** + * Class to load an outlier detection summary file, as produced by + * {@link ComputeKNNOutlierScores}, and compute a naive ensemble for it. Based + * on this initial estimation, an optimized ensemble is built using a greedy + * strategy. Starting with the best candidate only as initial ensemble, the most + * diverse candidate is investigated at each step.
If it improves towards the + * (estimated) target vector, it is added, otherwise it is discarded. + * + * This approach is naive, and it may be surprising that it can improve results. + * The reason is probably that diversity will result in a comparable ensemble, + * while the reduced ensemble size is actually responsible for the improvements, + * by being more decisive and less noisy due to dropping "unhelpful" members. + * + * This still leaves quite a bit of room for improvement. If you build upon this + * basic approach, please acknowledge our proof of concept work. + * + * Reference: + * <p> + * E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br /> + * On Evaluation of Outlier Rankings and Outlier Scores<br/> + * In Proceedings of the 12th SIAM International Conference on Data Mining + * (SDM), Anaheim, CA, 2012. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.") +public class GreedyEnsembleExperiment extends AbstractApplication { + /** + * Get static logger + */ + private static final Logging logger = Logging.getLogger(GreedyEnsembleExperiment.class); + + /** + * The data input part. + */ + private InputStep inputstep; + + /** + * Variant, where the truth vector is also updated. + */ + boolean refine_truth = false; + + /** + * Constructor. + * + * @param verbose Verbosity + * @param inputstep Input step + */ + public GreedyEnsembleExperiment(boolean verbose, InputStep inputstep) { + super(verbose); + this.inputstep = inputstep; + } + + @Override + public void run() { + // Note: the database contains the *result vectors*, not the original data + // points. 
+ final Database database = inputstep.getDatabase(); + final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database); + final DBID firstid = labels.iterDBIDs().next(); + final String firstlabel = labels.get(firstid); + if(!firstlabel.matches("bylabel")) { + throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); + } + + // Dimensionality and reference vector + final int dim = DatabaseUtil.dimensionality(relation); + final NumberVector<?, ?> refvec = relation.get(firstid); + + // Build the positive index set for ROC AUC. + Set<Integer> positive = new TreeSet<Integer>(); + for(int d = 0; d < dim; d++) { + if(refvec.doubleValue(d + 1) > 0) { + positive.add(d); + } + } + + final int estimated_outliers = (int) (0.005 * dim); + int union_outliers = 0; + final int[] outliers_seen = new int[dim]; + // Find the top-k for each ensemble member + { + for(DBID id : relation.iterDBIDs()) { + // Skip "by label", obviously + if(firstid.equals(id)) { + continue; + } + final NumberVector<?, ?> vec = relation.get(id); + TiedTopBoundedHeap<DoubleIntPair> heap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder()); + for(int i = 0; i < dim; i++) { + heap.add(new DoubleIntPair(vec.doubleValue(i + 1), i)); + } + if(heap.size() >= 2 * estimated_outliers) { + logger.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size()); + } + for(DoubleIntPair pair : heap) { + if(outliers_seen[pair.second] == 0) { + outliers_seen[pair.second] = 1; + union_outliers += 1; + } + else { + outliers_seen[pair.second] += 1; + } + } + } + } + logger.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers"); + // Build the final weight vector. 
+ final double[] estimated_weights = new double[dim]; + final double[] estimated_truth = new double[dim]; + updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth); + NumberVector<?, ?> estimated_truth_vec = refvec.newNumberVector(estimated_truth); + + PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> wdist = getDistanceFunction(estimated_weights); + PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> tdist = wdist; + + // Build the naive ensemble: + final double[] naiveensemble = new double[dim]; + { + for(DBID id : relation.iterDBIDs()) { + if(firstid.equals(id)) { + continue; + } + final NumberVector<?, ?> vec = relation.get(id); + for(int d = 0; d < dim; d++) { + naiveensemble[d] += vec.doubleValue(d + 1); + } + } + for(int d = 0; d < dim; d++) { + naiveensemble[d] /= (relation.size() - 1); + } + } + NumberVector<?, ?> naivevec = refvec.newNumberVector(naiveensemble); + + // Compute single AUC scores and estimations. + // Remember the method most similar to the estimation + double bestauc = 0.0; + String bestaucstr = ""; + double bestcost = Double.POSITIVE_INFINITY; + String bestcoststr = ""; + DBID bestid = null; + double bestest = Double.POSITIVE_INFINITY; + { + // Compute individual scores + for(DBID id : relation.iterDBIDs()) { + if(firstid.equals(id)) { + continue; + } + // fout.append(labels.get(id)); + final NumberVector<?, ?> vec = relation.get(id); + double auc = computeROCAUC(vec, positive, dim); + double estimated = wdist.doubleDistance(vec, estimated_truth_vec); + double cost = tdist.doubleDistance(vec, refvec); + logger.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(id)); + if(auc > bestauc) { + bestauc = auc; + bestaucstr = labels.get(id); + } + if(cost < bestcost) { + bestcost = cost; + bestcoststr = labels.get(id); + } + if(estimated < bestest) { + bestest = estimated; + bestid = id; + } + } + } + + // Initialize ensemble with "best" method + logger.verbose("Distance 
function: " + wdist); + logger.verbose("Initial estimation of outliers: " + union_outliers); + logger.verbose("Initializing ensemble with: " + labels.get(bestid)); + ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid); + ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs()); + enscands.remove(bestid); + enscands.remove(firstid); + final double[] greedyensemble = new double[dim]; + { + final NumberVector<?, ?> vec = relation.get(bestid); + for(int i = 0; i < dim; i++) { + greedyensemble[i] = vec.doubleValue(i + 1); + } + } + // Greedily grow the ensemble + final double[] testensemble = new double[dim]; + while(enscands.size() > 0) { + NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble); + + // Weighting factors for combining: + double s1 = ensemble.size() / (ensemble.size() + 1.); + double s2 = 1. / (ensemble.size() + 1.); + + final int heapsize = enscands.size(); + TopBoundedHeap<DoubleObjPair<DBID>> heap = new TopBoundedHeap<DoubleObjPair<DBID>>(heapsize, Collections.reverseOrder()); + for(DBID id : enscands) { + final NumberVector<?, ?> vec = relation.get(id); + double diversity = wdist.doubleDistance(vec, greedyvec); + heap.add(new DoubleObjPair<DBID>(diversity, id)); + } + while(heap.size() > 0) { + DBID bestadd = heap.poll().second; + enscands.remove(bestadd); + // Update ensemble: + final NumberVector<?, ?> vec = relation.get(bestadd); + for(int i = 0; i < dim; i++) { + testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i + 1) * s2; + } + NumberVector<?, ?> testvec = refvec.newNumberVector(testensemble); + double oldd = wdist.doubleDistance(estimated_truth_vec, greedyvec); + double newd = wdist.doubleDistance(estimated_truth_vec, testvec); + // logger.verbose("Distances: " + oldd + " vs. 
" + newd); + if(newd < oldd) { + System.arraycopy(testensemble, 0, greedyensemble, 0, dim); + ensemble.add(bestadd); + // logger.verbose("Growing ensemble with: " + labels.get(bestadd)); + break; // Recompute heap + } + else { + // logger.verbose("Discarding: " + labels.get(bestadd)); + if(refine_truth) { + boolean refresh = false; + // Update target vectors and weights + TiedTopBoundedHeap<DoubleIntPair> oheap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder()); + for(int i = 0; i < dim; i++) { + oheap.add(new DoubleIntPair(vec.doubleValue(i + 1), i)); + } + for(DoubleIntPair pair : oheap) { + assert (outliers_seen[pair.second] > 0); + outliers_seen[pair.second] -= 1; + if(outliers_seen[pair.second] == 0) { + union_outliers -= 1; + refresh = true; + } + } + if(refresh) { + updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth); + estimated_truth_vec = refvec.newNumberVector(estimated_truth); + } + } + } + } + } + // Build the improved ensemble: + StringBuffer greedylbl = new StringBuffer(); + { + for(DBID id : ensemble) { + if(greedylbl.length() > 0) { + greedylbl.append(" "); + } + greedylbl.append(labels.get(id)); + } + } + NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble); + logger.verbose("Estimated outliers remaining: " + union_outliers); + logger.verbose("Greedy ensemble: " + greedylbl.toString()); + + logger.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")"); + logger.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")"); + // Evaluate the naive ensemble and the "shrunk" ensemble + double naiveauc, naivecost; + { + naiveauc = computeROCAUC(naivevec, positive, dim); + naivecost = tdist.doubleDistance(naivevec, refvec); + logger.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost); + logger.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0)); + } + double greedyauc, 
greedycost; + { + greedyauc = computeROCAUC(greedyvec, positive, dim); + greedycost = tdist.doubleDistance(greedyvec, refvec); + logger.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost); + logger.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0)); + logger.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0)); + } + { + MeanVariance meanauc = new MeanVariance(); + MeanVariance meancost = new MeanVariance(); + HashSetModifiableDBIDs candidates = DBIDUtil.newHashSet(relation.getDBIDs()); + candidates.remove(firstid); + for(int i = 0; i < 5000; i++) { + // Build the improved ensemble: + final double[] randomensemble = new double[dim]; + { + DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long)i); + for(DBID id : random) { + assert (!firstid.equals(id)); + // logger.verbose("Using: "+labels.get(id)); + final NumberVector<?, ?> vec = relation.get(id); + for(int d = 0; d < dim; d++) { + randomensemble[d] += vec.doubleValue(d + 1); + } + } + for(int d = 0; d < dim; d++) { + randomensemble[d] /= ensemble.size(); + } + } + NumberVector<?, ?> randomvec = refvec.newNumberVector(randomensemble); + double auc = computeROCAUC(randomvec, positive, dim); + meanauc.put(auc); + double cost = tdist.doubleDistance(randomvec, refvec); + meancost.put(cost); + } + logger.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev())); + logger.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1)); + logger.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations."); + logger.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev())); + 
logger.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0)); + logger.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations."); + logger.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0)); + logger.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0)); + logger.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0)); + } + } + + protected void updateEstimations(final int[] outliers_seen, int union_outliers, final double[] estimated_weights, final double[] estimated_truth) { + for(int i = 0; i < outliers_seen.length; i++) { + if(outliers_seen[i] > 0) { + estimated_weights[i] = 0.5 / union_outliers; + estimated_truth[i] = 1.0; + } + else { + estimated_weights[i] = 0.5 / (outliers_seen.length - union_outliers); + estimated_truth[i] = 0.0; + } + } + } + + private PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> getDistanceFunction(double[] estimated_weights) { + // return new WeightedSquaredEuclideanDistanceFunction(estimated_weights); + // return new WeightedLPNormDistanceFunction(1.0, estimated_weights); + return new WeightedPearsonCorrelationDistanceFunction(estimated_weights); + } + + private double computeROCAUC(NumberVector<?, ?> vec, Set<Integer> positive, int dim) { + final DoubleIntPair[] scores = new DoubleIntPair[dim]; + for(int d = 0; d < dim; d++) { + scores[d] = new DoubleIntPair(vec.doubleValue(d + 1), d); + } + Arrays.sort(scores, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); + return ROC.computeAUC(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator())); + } + + double gain(double score, double ref, double optimal) { + return 1 - ((optimal - score) / (optimal - ref)); + } + + /** 
+ * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractApplication.Parameterizer { + /** + * Data source + */ + InputStep inputstep; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Data input + inputstep = config.tryInstantiate(InputStep.class); + } + + @Override + protected AbstractApplication makeInstance() { + return new GreedyEnsembleExperiment(verbose, inputstep); + } + } + + /** + * Main method. + * + * @param args Command line parameters. + */ + public static void main(String[] args) { + GreedyEnsembleExperiment.runCLIApplication(GreedyEnsembleExperiment.class, args); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java new file mode 100644 index 00000000..2c728878 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java @@ -0,0 +1,307 @@ +package de.lmu.ifi.dbs.elki.application.greedyensemble; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.awt.image.BufferedImage; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.batik.util.SVGConstants; + +import de.lmu.ifi.dbs.elki.application.AbstractApplication; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; +import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage; +import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage.SimilarityMatrix; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; +import de.lmu.ifi.dbs.elki.utilities.scaling.LinearScaling; +import de.lmu.ifi.dbs.elki.visualization.VisualizationTask; +import de.lmu.ifi.dbs.elki.visualization.VisualizerContext; +import de.lmu.ifi.dbs.elki.visualization.VisualizerParameterizer; +import de.lmu.ifi.dbs.elki.visualization.gui.SimpleSVGViewer; +import de.lmu.ifi.dbs.elki.visualization.svg.SVGPlot; +import de.lmu.ifi.dbs.elki.visualization.visualizers.Visualization; +import de.lmu.ifi.dbs.elki.visualization.visualizers.visunproj.SimilarityMatrixVisualizer; +import de.lmu.ifi.dbs.elki.workflow.InputStep; + +/** + * Class to load an outlier detection summary file, as produced by 
+ * {@link ComputeKNNOutlierScores}, and compute a matrix with the pairwise + * gains. It will have one column / row obtained for each combination. + * + * The gain is always computed in relation to the better of the two input + * methods. Green colors indicate the result has improved, red indicate it + * became worse. + * + * Reference: + * <p> + * E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br /> + * On Evaluation of Outlier Rankings and Outlier Scores<br/> + * In Proceedings of the 12th SIAM International Conference on Data Mining + * (SDM), Anaheim, CA, 2012. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.") +public class VisualizePairwiseGainMatrix extends AbstractApplication { + /** + * Get static logger + */ + private static final Logging logger = Logging.getLogger(VisualizePairwiseGainMatrix.class); + + /** + * The data input part. + */ + private InputStep inputstep; + + /** + * Parameterizer for visualizers + */ + private VisualizerParameterizer vispar; + + /** + * Constructor. 
+ * + * @param verbose Verbosity + * @param inputstep Input step + * @param vispar Visualizer parameterizer + */ + public VisualizePairwiseGainMatrix(boolean verbose, InputStep inputstep, VisualizerParameterizer vispar) { + super(verbose); + this.inputstep = inputstep; + this.vispar = vispar; + } + + @Override + public void run() { + final Database database = inputstep.getDatabase(); + final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database); + final DBID firstid = labels.iterDBIDs().next(); + final String firstlabel = labels.get(firstid); + if(!firstlabel.matches("bylabel")) { + throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); + } + + // Dimensionality and reference vector + final int dim = DatabaseUtil.dimensionality(relation); + final NumberVector<?, ?> refvec = relation.get(firstid); + + // Build the truth vector + Set<Integer> pos = new TreeSet<Integer>(); + final DoubleIntPair[] combined = new DoubleIntPair[dim]; + { + for(int d = 0; d < dim; d++) { + combined[d] = new DoubleIntPair(0, d); + if(refvec.doubleValue(d + 1) > 0) { + pos.add(d); + } + } + } + + ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs()); + ids.remove(firstid); + final int size = ids.size(); + + double[][] data = new double[size][size]; + DoubleMinMax minmax = new DoubleMinMax(); + + for(int a = 0; a < size; a++) { + final NumberVector<?, ?> veca = relation.get(ids.get(a)); + // Direct AUC score: + { + for(int d = 0; d < dim; d++) { + combined[d].first = veca.doubleValue(d + 1); + combined[d].second = d; + } + Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); + double auc = ROC.computeAUC(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); + data[a][a] = auc; + // minmax.put(auc); + // logger.verbose(auc + " " + labels.get(ids.get(a))); + } + // Compare to 
others, exploiting symmetry + for(int b = a + 1; b < size; b++) { + final NumberVector<?, ?> vecb = relation.get(ids.get(b)); + for(int d = 0; d < dim; d++) { + combined[d].first = veca.doubleValue(d + 1) + vecb.doubleValue(d + 1); + combined[d].second = d; + } + Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); + double auc = ROC.computeAUC(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); + // logger.verbose(auc + " " + labels.get(ids.get(a)) + " " + + // labels.get(ids.get(b))); + data[a][b] = auc; + data[b][a] = auc; + // minmax.put(auc); + } + } + for(int a = 0; a < size; a++) { + for(int b = a + 1; b < size; b++) { + double ref = Math.max(data[a][a], data[b][b]); + data[a][b] = (data[a][b] - ref) / (1 - ref); + data[b][a] = (data[b][a] - ref) / (1 - ref); + // logger.verbose(data[a][b] + " " + labels.get(ids.get(a)) + " " + + // labels.get(ids.get(b))); + minmax.put(data[a][b]); + } + } + for(int a = 0; a < size; a++) { + data[a][a] = 0; + } + + logger.verbose(minmax.toString()); + + boolean hasneg = (minmax.getMin() < -1E-3); + LinearScaling scale; + if(!hasneg) { + scale = new LinearScaling(minmax); + } + else { + scale = LinearScaling.fromMinMax(0.0, Math.max(minmax.getMax(), -minmax.getMin())); + } + + BufferedImage img = new BufferedImage(size, size, BufferedImage.TYPE_INT_RGB); + for(int x = 0; x < size; x++) { + for(int y = x; y < size; y++) { + double val = data[x][y]; + val = scale.getScaled(val); + // Compute color: + final int col; + { + if(!hasneg) { + int ival = 0xFF & (int) (255 * Math.max(0, val)); + col = 0xff000000 | (ival << 16) | (ival << 8) | ival; + } + else { + if(val >= 0) { + int ival = 0xFF & (int) (255 * val); + col = 0xff000000 | (ival << 8); + } + else { + int ival = 0xFF & (int) (255 * -val); + col = 0xff000000 | (ival << 16); + } + } + } + img.setRGB(x, y, col); + img.setRGB(y, x, col); + } + } + SimilarityMatrix smat = new ComputeSimilarityMatrixImage.SimilarityMatrix(img, 
relation, ids); + database.getHierarchy().add(database, smat); + + VisualizerContext context = vispar.newContext(database); + + // Attach visualizers to results + SimilarityMatrixVisualizer.Factory factory = new SimilarityMatrixVisualizer.Factory(); + factory.processNewResult(database, database); + + List<VisualizationTask> tasks = ResultUtil.filterResults(database, VisualizationTask.class); + for(VisualizationTask task : tasks) { + if(task.getFactory() == factory) { + showVisualization(context, factory, task); + } + } + } + + /** + * Show a single visualization. + * + * @param context + * + * @param factory + * @param task + */ + private void showVisualization(VisualizerContext context, SimilarityMatrixVisualizer.Factory factory, VisualizationTask task) { + SVGPlot plot = new SVGPlot(); + Visualization vis = factory.makeVisualization(task.clone(plot, context, null, 1.0, 1.0)); + plot.getRoot().appendChild(vis.getLayer()); + plot.getRoot().setAttribute(SVGConstants.SVG_WIDTH_ATTRIBUTE, "20cm"); + plot.getRoot().setAttribute(SVGConstants.SVG_HEIGHT_ATTRIBUTE, "20cm"); + plot.getRoot().setAttribute(SVGConstants.SVG_VIEW_BOX_ATTRIBUTE, "0 0 1 1"); + plot.updateStyleElement(); + + (new SimpleSVGViewer()).setPlot(plot); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractApplication.Parameterizer { + /** + * Data source + */ + InputStep inputstep; + + /** + * Parameterizer for visualizers + */ + private VisualizerParameterizer vispar; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Data input + inputstep = config.tryInstantiate(InputStep.class); + // Visualization options + vispar = config.tryInstantiate(VisualizerParameterizer.class); + } + + @Override + protected AbstractApplication makeInstance() { + return new VisualizePairwiseGainMatrix(verbose, inputstep, vispar); + } + } + + /** + * Main method. 
+ * + * @param args Command line parameters. + */ + public static void main(String[] args) { + VisualizePairwiseGainMatrix.runCLIApplication(VisualizePairwiseGainMatrix.class, args); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/package-info.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/package-info.java new file mode 100644 index 00000000..29b34e96 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/package-info.java @@ -0,0 +1,32 @@ +/** + * <p>Greedy ensembles for outlier detection.</p> + * + * <p>This package contains code that was used for the greedy ensemble experiment in + * <blockquote> E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel<br /> + * On Evaluation of Outlier Rankings and Outlier Scores<br/> + * In Proceedings of the 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.</blockquote> + * </p> + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.application.greedyensemble;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIProperties.java b/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIServices.java index fa260564..21ed047b 100644 --- a/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIProperties.java +++ b/src/de/lmu/ifi/dbs/elki/application/internal/CheckELKIServices.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.internal; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,36 +23,44 @@ package de.lmu.ifi.dbs.elki.application.internal; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.properties.Properties; +import de.lmu.ifi.dbs.elki.utilities.ELKIServiceLoader; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.InspectionUtil; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; /** - * Helper application to test the ELKI properties file for "missing" implementations. + * Helper application to test the ELKI properties file for "missing" + * implementations. * * @author Erich Schubert * - * @apiviz.uses Properties + * @apiviz.uses ELKIServiceLoader */ -public class CheckELKIProperties { +public class CheckELKIServices { /** * The logger for this class. 
*/ - private static final Logging logger = Logging.getLogger(CheckELKIProperties.class); - + private static final Logging logger = Logging.getLogger(CheckELKIServices.class); + /** * Pattern to strip comments, while keeping commented class names. */ - private Pattern strip = Pattern.compile("^[\\s#]*(.*?)[\\s,]*$"); + private Pattern strip = Pattern.compile("^[\\s#]*(.*?)[\\s]*$"); /** * Package to skip matches in - unreleased code. @@ -65,25 +73,36 @@ public class CheckELKIProperties { * @param argv Command line arguments */ public static void main(String[] argv) { - new CheckELKIProperties().checkProperties(); + new CheckELKIServices().checkServices(); } - + /** * Retrieve all properties and check them. */ - public void checkProperties() { - Set<String> props = Properties.ELKI_PROPERTIES.getPropertyNames(); - for(String prop : props) { - checkProperty(prop); + public void checkServices() { + URL u = getClass().getClassLoader().getResource(ELKIServiceLoader.PREFIX); + try { + for(String prop : new File(u.toURI()).list()) { + if (".svn".equals(prop)) { + continue; + } + if (logger.isVerbose()) { + logger.verbose("Checking property: "+prop); + } + checkService(prop); + } + } + catch(URISyntaxException e) { + throw new AbortException("Cannot check all properties, as some are not in a file: URL."); } } /** - * Check a single property + * Check a single service class * - * @param prop Property = Class name. + * @param prop Class name. 
*/ - private void checkProperty(String prop) { + private void checkService(String prop) { Class<?> cls; try { cls = Class.forName(prop); @@ -108,23 +127,34 @@ public class CheckELKIProperties { names.add(c2.getName()); } - String[] known = Properties.ELKI_PROPERTIES.getProperty(cls.getName()); - for(String k : known) { - Matcher m = strip.matcher(k); - if(m.matches()) { - String stripped = m.group(1); - if(stripped.length() > 0) { - if(names.contains(stripped)) { - names.remove(stripped); - } - else { - logger.warning("Name " + stripped + " found for property " + prop + " but no class discovered (or referenced twice?)."); + try { + InputStream is = getClass().getClassLoader().getResource(ELKIServiceLoader.PREFIX + cls.getName()).openStream(); + BufferedReader r = new BufferedReader(new InputStreamReader(is, "utf-8")); + for(String line;;) { + line = r.readLine(); + // End of stream: + if(line == null) { + break; + } + Matcher m = strip.matcher(line); + if(m.matches()) { + String stripped = m.group(1); + if(stripped.length() > 0) { + if(names.contains(stripped)) { + names.remove(stripped); + } + else { + logger.warning("Name " + stripped + " found for property " + prop + " but no class discovered (or referenced twice?)."); + } } } + else { + logger.warning("Line: " + line + " didn't match regexp."); + } } - else { - logger.warning("Line: " + k + " didn't match regexp."); - } + } + catch(IOException e) { + logger.exception(e); } if(names.size() > 0) { StringBuffer message = new StringBuffer(); @@ -134,7 +164,7 @@ public class CheckELKIProperties { // TODO: sort by package, then classname Collections.sort(sorted); for(String remaining : sorted) { - message.append("# " + remaining + ",\\" + FormatUtil.NEWLINE); + message.append("# " + remaining + FormatUtil.NEWLINE); } logger.warning(message.toString()); } diff --git a/src/de/lmu/ifi/dbs/elki/application/internal/CheckParameterizables.java b/src/de/lmu/ifi/dbs/elki/application/internal/CheckParameterizables.java index 
017f2d12..b94ee47a 100644 --- a/src/de/lmu/ifi/dbs/elki/application/internal/CheckParameterizables.java +++ b/src/de/lmu/ifi/dbs/elki/application/internal/CheckParameterizables.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.internal; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/internal/DocumentParameters.java b/src/de/lmu/ifi/dbs/elki/application/internal/DocumentParameters.java index 05af4b4d..c77e9dbb 100644 --- a/src/de/lmu/ifi/dbs/elki/application/internal/DocumentParameters.java +++ b/src/de/lmu/ifi/dbs/elki/application/internal/DocumentParameters.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.internal; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -33,6 +33,7 @@ import java.lang.reflect.Constructor; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.Iterator; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -50,11 +51,10 @@ import org.w3c.dom.Document; import org.w3c.dom.Element; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.properties.Properties; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; +import de.lmu.ifi.dbs.elki.utilities.ELKIServiceLoader; import de.lmu.ifi.dbs.elki.utilities.InspectionUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.HashMapList; -import de.lmu.ifi.dbs.elki.utilities.iterator.IterableIterator; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable; import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizer; @@ -635,8 +635,8 @@ public class DocumentParameters { private static void appendKnownImplementationsIfNonempty(Document htmldoc, ClassParameter<?> opt, Element elemdd) { if(opt.getRestrictionClass() != Object.class) { - IterableIterator<Class<?>> iter = opt.getKnownImplementations(); - if(iter.hasNext()) { + List<Class<?>> iter = opt.getKnownImplementations(); + if(!iter.isEmpty()) { Element p = htmldoc.createElement(HTMLUtil.HTML_P_TAG); p.appendChild(htmldoc.createTextNode(HEADER_KNOWN_IMPLEMENTATIONS)); elemdd.appendChild(p); @@ -652,7 +652,8 @@ public class DocumentParameters { elemdd.appendChild(ul); } // Report when not in properties file. - if(Properties.ELKI_PROPERTIES.getProperty(opt.getRestrictionClass().getName()).length == 0) { + Iterator<Class<?>> clss = new ELKIServiceLoader(opt.getRestrictionClass()); + if (!clss.hasNext()) { logger.warning(opt.getRestrictionClass().getName() + " not in properties. 
No autocompletion available in release GUI."); } } diff --git a/src/de/lmu/ifi/dbs/elki/application/internal/DocumentReferences.java b/src/de/lmu/ifi/dbs/elki/application/internal/DocumentReferences.java index 949a7874..2e002243 100644 --- a/src/de/lmu/ifi/dbs/elki/application/internal/DocumentReferences.java +++ b/src/de/lmu/ifi/dbs/elki/application/internal/DocumentReferences.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.internal; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,12 +25,12 @@ package de.lmu.ifi.dbs.elki.application.internal; import java.io.BufferedOutputStream; import java.io.File; -import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.io.PrintStream; +import java.lang.reflect.Method; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -44,9 +44,9 @@ import org.w3c.dom.DOMImplementation; import org.w3c.dom.Document; import org.w3c.dom.Element; +import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.utilities.InspectionUtil; -import de.lmu.ifi.dbs.elki.utilities.documentation.DocumentationUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; import de.lmu.ifi.dbs.elki.utilities.xml.HTMLUtil; @@ -64,46 +64,56 @@ public class DocumentReferences { private static final String MODIFICATION_WARNING = "WARNING: THIS DOCUMENT IS AUTOMATICALLY GENERATED. 
MODIFICATIONS MAY GET LOST."; /** + * Logger + */ + private static final Logging logger = Logging.getLogger(DocumentReferences.class); + + /** * @param args Command line arguments */ public static void main(String[] args) { - if(args.length != 1) { - LoggingUtil.warning("I need exactly one file name to operate!"); + if(args.length < 1 || args.length > 2) { + LoggingUtil.warning("I need exactly one or two file names to operate!"); System.exit(1); } - if(!args[0].endsWith(".html")) { - LoggingUtil.warning("File name doesn't end with .html!"); + if(!args[0].endsWith(".html") || (args.length > 1 && !args[1].endsWith(".wiki"))) { + LoggingUtil.warning("File name doesn't end in expected extension!"); System.exit(1); } - File references = new File(args[0]); - { - FileOutputStream reffo; - try { - reffo = new FileOutputStream(references); - } - catch(FileNotFoundException e) { - LoggingUtil.exception("Can't create output stream!", e); - throw new RuntimeException(e); - } + List<Pair<Reference, List<Class<?>>>> refs = sortedReferences(); + try { + File references = new File(args[0]); + FileOutputStream reffo = new FileOutputStream(references); + Document refdoc = documentReferences(refs); OutputStream refstream = new BufferedOutputStream(reffo); - Document refdoc = documentParameters(); + HTMLUtil.writeXHTML(refdoc, refstream); + refstream.flush(); + refstream.close(); + reffo.close(); + } + catch(IOException e) { + LoggingUtil.exception("IO Exception writing HTML output.", e); + throw new RuntimeException(e); + } + if(args.length > 1) { try { - HTMLUtil.writeXHTML(refdoc, refstream); - refstream.flush(); - refstream.close(); - reffo.close(); + File refwiki = new File(args[1]); + FileOutputStream reffow = new FileOutputStream(refwiki); + PrintStream refstreamW = new PrintStream(reffow); + documentReferencesWiki(refs, refstreamW); + refstreamW.flush(); + refstreamW.close(); + reffow.close(); } catch(IOException e) { - LoggingUtil.exception("IO Exception writing output.", e); 
+ LoggingUtil.exception("IO Exception writing Wiki output.", e); throw new RuntimeException(e); } } } - private static Document documentParameters() { - List<Pair<Reference, List<Class<?>>>> refs = sortedReferences(); - + private static Document documentReferences(List<Pair<Reference, List<Class<?>>>> refs) { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder; try { @@ -165,7 +175,7 @@ public class DocumentReferences { { boolean first = true; for(Class<?> cls : pair.second) { - if (!first) { + if(!first) { classdt.appendChild(htmldoc.createTextNode(", ")); } Element classan = htmldoc.createElement(HTMLUtil.HTML_A_TAG); @@ -177,7 +187,7 @@ public class DocumentReferences { classa.setAttribute(HTMLUtil.HTML_HREF_ATTRIBUTE, linkForClassName(cls.getName())); classa.setTextContent(cls.getName()); classdt.appendChild(classa); - + first = false; } } @@ -222,25 +232,98 @@ public class DocumentReferences { return htmldoc; } - private static List<Pair<Reference, List<Class<?>>>> sortedReferences() { - ArrayList<Class<?>> classes = findAllClassesWithReferences(); - Collections.sort(classes, new InspectionUtil.ClassSorter()); + private static void documentReferencesWiki(List<Pair<Reference, List<Class<?>>>> refs, PrintStream refstreamW) { + for(Pair<Reference, List<Class<?>>> pair : refs) { + // JavaDoc links for relevant classes. 
+ { + boolean first = true; + for(Class<?> cls : pair.second) { + if(!first) { + refstreamW.println(",[[br]]"); + } + refstreamW.print("[[javadoc("); + refstreamW.print(cls.getName()); + refstreamW.print(","); + refstreamW.print(cls.getName()); + refstreamW.print(")]]"); + + first = false; + } + } + refstreamW.println(""); - List<Pair<Reference, List<Class<?>>>> refs = new ArrayList<Pair<Reference, List<Class<?>>>>(classes.size()); - Map<Reference, List<Class<?>>> map = new HashMap<Reference, List<Class<?>>>(classes.size()); - for(Class<?> cls : classes) { - Reference ref = DocumentationUtil.getReference(cls); - List<Class<?>> list = map.get(ref); - if(list == null) { - list = new ArrayList<Class<?>>(5); - map.put(ref, list); - refs.add(new Pair<Reference, List<Class<?>>>(ref, list)); + String indent = " "; + { + Reference ref = pair.first; + // Prefix + if(ref.prefix().length() > 0) { + refstreamW.println(indent + ref.prefix() + " [[br]]"); + } + // Authors + refstreamW.println(indent + "By: " + ref.authors() + " [[br]]"); + // Title + refstreamW.println(indent + "'''" + ref.title() + "'''" + " [[br]]"); + // Booktitle + refstreamW.println(indent + "In: " + ref.booktitle() + " [[br]]"); + // URL + if(ref.url().length() > 0) { + refstreamW.println(indent + "Online: [" + ref.url() + "][[br]]"); + } } - list.add(cls); + refstreamW.println(""); + refstreamW.println(""); + } + } + + private static List<Pair<Reference, List<Class<?>>>> sortedReferences() { + List<Pair<Reference, List<Class<?>>>> refs = new ArrayList<Pair<Reference, List<Class<?>>>>(); + Map<Reference, List<Class<?>>> map = new HashMap<Reference, List<Class<?>>>(); + + for(final Class<?> cls : InspectionUtil.findAllImplementations(Object.class, true)) { + inspectClass(cls, refs, map); } return refs; } + private static void inspectClass(final Class<?> cls, List<Pair<Reference, List<Class<?>>>> refs, Map<Reference, List<Class<?>>> map) { + try { + if(cls.isAnnotationPresent(Reference.class)) { + Reference 
ref = cls.getAnnotation(Reference.class); + List<Class<?>> list = map.get(ref); + if(list == null) { + list = new ArrayList<Class<?>>(5); + map.put(ref, list); + refs.add(new Pair<Reference, List<Class<?>>>(ref, list)); + } + list.add(cls); + } + // Inner classes + for(Class<?> c2 : cls.getDeclaredClasses()) { + inspectClass(c2, refs, map); + } + for(Method m : cls.getDeclaredMethods()) { + if(m.isAnnotationPresent(Reference.class)) { + Reference ref = m.getAnnotation(Reference.class); + List<Class<?>> list = map.get(ref); + if(list == null) { + list = new ArrayList<Class<?>>(5); + map.put(ref, list); + refs.add(new Pair<Reference, List<Class<?>>>(ref, list)); + } + list.add(cls); + } + } + } + catch(NoClassDefFoundError e) { + if(!cls.getCanonicalName().startsWith("experimentalcode.")) { + logger.warning("Exception in finding references for class " + cls.getCanonicalName() + " - missing referenced class?"); + } + } + catch(Error e) { + logger.warning("Exception in finding references for class " + cls.getCanonicalName() + ": " + e, e); + } + } + private static String linkForClassName(String name) { String link = name.replace(".", "/") + ".html"; return link; @@ -253,10 +336,17 @@ public class DocumentReferences { */ public static ArrayList<Class<?>> findAllClassesWithReferences() { ArrayList<Class<?>> references = new ArrayList<Class<?>>(); - for(final Class<?> cls : InspectionUtil.findAllImplementations(Object.class, false)) { + for(final Class<?> cls : InspectionUtil.findAllImplementations(Object.class, true)) { if(cls.isAnnotationPresent(Reference.class)) { references.add(cls); } + else { + for(Method m : cls.getDeclaredMethods()) { + if(m.isAnnotationPresent(Reference.class)) { + references.add(cls); + } + } + } } return references; } diff --git a/src/de/lmu/ifi/dbs/elki/application/internal/package-info.java b/src/de/lmu/ifi/dbs/elki/application/internal/package-info.java index dd7746e8..56326655 100644 --- 
a/src/de/lmu/ifi/dbs/elki/application/internal/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/application/internal/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONBuffer.java b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONBuffer.java index 9ef3f688..c077e9d3 100644 --- a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONBuffer.java +++ b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONBuffer.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.jsmap; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONResultHandler.java b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONResultHandler.java index 96a24dcb..7fc26496 100644 --- a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONResultHandler.java +++ b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONResultHandler.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.jsmap; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java index 873420fd..5b210795 100644 --- a/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java +++ b/src/de/lmu/ifi/dbs/elki/application/jsmap/JSONWebServer.java @@ -4,7 +4,7 @@ package 
de.lmu.ifi.dbs.elki.application.jsmap; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/jsmap/package-info.java b/src/de/lmu/ifi/dbs/elki/application/jsmap/package-info.java index f359198a..e89e272c 100644 --- a/src/de/lmu/ifi/dbs/elki/application/jsmap/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/application/jsmap/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/package-info.java b/src/de/lmu/ifi/dbs/elki/application/package-info.java index b81ac8f5..35b3b3d9 100644 --- a/src/de/lmu/ifi/dbs/elki/application/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/application/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java b/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java index 0cb950af..0419f791 100644 --- a/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java +++ b/src/de/lmu/ifi/dbs/elki/application/visualization/KNNExplorer.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.application.visualization; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2011 + Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und 
Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,8 +29,6 @@ import java.awt.Component; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.util.HashMap; -import java.util.List; -import java.util.ListIterator; import javax.swing.DefaultListCellRenderer; import javax.swing.DefaultListModel; @@ -62,12 +60,14 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.scales.LinearScale; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; @@ -79,7 +79,6 @@ import de.lmu.ifi.dbs.elki.visualization.batikutil.LazyCanvasResizer; import de.lmu.ifi.dbs.elki.visualization.batikutil.NodeReplacer; import de.lmu.ifi.dbs.elki.visualization.css.CSSClassManager.CSSNamingConflict; import de.lmu.ifi.dbs.elki.visualization.savedialog.SVGSaveDialog; -import de.lmu.ifi.dbs.elki.visualization.scales.LinearScale; import de.lmu.ifi.dbs.elki.visualization.style.PropertiesBasedStyleLibrary; import de.lmu.ifi.dbs.elki.visualization.style.StyleLibrary; import de.lmu.ifi.dbs.elki.visualization.svg.SVGPath; @@ -393,7 +392,7 @@ public class KNNExplorer<O extends NumberVector<?, ?>, D extends NumberDistance< try { StyleLibrary style = new PropertiesBasedStyleLibrary(); - SVGSimpleLinearAxis.drawAxis(plot, 
viewport, this.s, 0.0, StyleLibrary.SCALE, 0.0, 0.0, true, false, style); + SVGSimpleLinearAxis.drawAxis(plot, viewport, this.s, 0.0, StyleLibrary.SCALE, 0.0, 0.0, SVGSimpleLinearAxis.LabelStyle.LEFTHAND, style); } catch(CSSNamingConflict e) { LoggingUtil.exception(e); @@ -436,21 +435,21 @@ public class KNNExplorer<O extends NumberVector<?, ?>, D extends NumberDistance< for(Object o : sel) { DBID idx = (DBID) o; - List<DistanceResultPair<D>> knn = knnQuery.getKNNForDBID(idx, k); + KNNResult<D> knn = knnQuery.getKNNForDBID(idx, k); - double maxdist = knn.get(knn.size() - 1).getDistance().doubleValue(); + double maxdist = knn.getKNNDistance().doubleValue(); // avoid division by zero. if(maxdist == 0) { maxdist = 1; } - for(ListIterator<DistanceResultPair<D>> iter = knn.listIterator(knn.size()); iter.hasPrevious();) { - DistanceResultPair<D> pair = iter.previous(); + for (int i = knn.size() - 1; i >= 0; i--) { + DistanceResultPair<D> pair = knn.get(i); Element line = plotSeries(pair.getDBID(), MAXRESOLUTION); double dist = pair.getDistance().doubleValue() / maxdist; Color color = getColor(dist); String colstr = "#" + Integer.toHexString(color.getRGB()).substring(2); - String width = (pair.getDBID() == idx) ? "0.5%" : "0.2%"; + String width = (pair.getDBID().equals(idx)) ? 
"0.5%" : "0.2%"; SVGUtil.setStyle(line, "stroke: " + colstr + "; stroke-width: " + width + "; fill: none"); newe.appendChild(line); // put into cache diff --git a/src/de/lmu/ifi/dbs/elki/application/visualization/package-info.java b/src/de/lmu/ifi/dbs/elki/application/visualization/package-info.java index 70870cfd..717a80f9 100644 --- a/src/de/lmu/ifi/dbs/elki/application/visualization/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/application/visualization/package-info.java @@ -6,7 +6,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2011 +Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |