From cde76aeb42240f7270bc6605c606ae07d2dc5a7d Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Sat, 9 Mar 2019 22:30:28 +0000 Subject: Import Upstream version 0.4.0~beta1 --- .../statistics/DistanceStatisticsWithClasses.java | 447 +++++++++++++++++++++ .../statistics/EvaluateRankingQuality.java | 219 ++++++++++ .../statistics/RankingQualityHistogram.java | 183 +++++++++ .../elki/algorithm/statistics/package-info.java | 29 ++ 4 files changed, 878 insertions(+) create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java create mode 100644 src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/statistics') diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java new file mode 100644 index 00000000..97f6b9cd --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java @@ -0,0 +1,447 @@ +package de.lmu.ifi.dbs.elki.algorithm.statistics; +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2011 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.Random; +import java.util.TreeSet; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.AggregatingHistogram; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.FlexiHistogram; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.CollectionResult; +import de.lmu.ifi.dbs.elki.result.HistogramResult; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OnlyOneIsAllowedToBeSetGlobalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Algorithm to gather statistics over the distance distribution in the data + * set. + * + * @author Erich Schubert + * @param Object type + * @param Distance type + */ +// TODO: optimize for double distances. +@Title("Distance Histogram") +@Description("Computes a histogram over the distances occurring in the data set.") +public class DistanceStatisticsWithClasses> extends AbstractDistanceBasedAlgorithm> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(DistanceStatisticsWithClasses.class); + + /** + * Flag to compute exact value range for binning. + */ + public static final OptionID EXACT_ID = OptionID.getOrCreateOptionID("diststat.exact", "In a first pass, compute the exact minimum and maximum, at the cost of O(2*n*n) instead of O(n*n). The number of resulting bins is guaranteed to be as requested."); + + /** + * Flag to enable sampling + */ + public static final OptionID SAMPLING_ID = OptionID.getOrCreateOptionID("diststat.sampling", "Enable sampling of O(n) size to determine the minimum and maximum distances approximately. The resulting number of bins can be larger than the given n."); + + /** + * Option to configure the number of bins to use. + */ + public static final OptionID HISTOGRAM_BINS_ID = OptionID.getOrCreateOptionID("diststat.bins", "Number of bins to use in the histogram. By default, it is only guaranteed to be within 1*n and 2*n of the given number."); + + /** + * Number of bins to use in sampling. + */ + private int numbin; + + /** + * Sampling + */ + private boolean sampling = false; + + /** + * Sampling + */ + private boolean exact = false; + + /** + * Constructor. + * + * @param distanceFunction Distance function to use + * @param numbins Number of bins + * @param exact Exactness flag + * @param sampling Sampling flag + */ + public DistanceStatisticsWithClasses(DistanceFunction distanceFunction, int numbins, boolean exact, boolean sampling) { + super(distanceFunction); + this.numbin = numbins; + this.exact = exact; + this.sampling = sampling; + } + + /** + * Iterates over all points in the database. + */ + @Override + public HistogramResult run(Database database) throws IllegalStateException { + final Relation relation = database.getRelation(getInputTypeRestriction()[0]); + final DistanceQuery distFunc = database.getDistanceQuery(relation, getDistanceFunction()); + + final StepProgress stepprog = logger.isVerbose() ? new StepProgress("Distance statistics", 2) : null; + + // determine binning ranges. + DoubleMinMax gminmax = new DoubleMinMax(); + + // Cluster by labels + Collection> split = (new ByLabelClustering()).run(database).getAllClusters(); + + // global in-cluster min/max + DoubleMinMax giminmax = new DoubleMinMax(); + // global other-cluster min/max + DoubleMinMax gominmax = new DoubleMinMax(); + // in-cluster distances + MeanVariance mimin = new MeanVariance(); + MeanVariance mimax = new MeanVariance(); + MeanVariance midif = new MeanVariance(); + // other-cluster distances + MeanVariance momin = new MeanVariance(); + MeanVariance momax = new MeanVariance(); + MeanVariance modif = new MeanVariance(); + // Histogram + final AggregatingHistogram, Pair> histogram; + if(stepprog != null) { + stepprog.beginStep(1, "Prepare histogram.", logger); + } + if(exact) { + gminmax = exactMinMax(relation, distFunc); + histogram = AggregatingHistogram.LongSumLongSumHistogram(numbin, gminmax.getMin(), gminmax.getMax()); + } + else if(sampling) { + gminmax = sampleMinMax(relation, distFunc); + histogram = AggregatingHistogram.LongSumLongSumHistogram(numbin, gminmax.getMin(), gminmax.getMax()); + } + else { + histogram = FlexiHistogram.LongSumLongSumHistogram(numbin); + } + + if(stepprog != null) { + stepprog.beginStep(2, "Build histogram.", logger); + } + final FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), logger) : null; + // iterate per cluster + final Pair incFirst = new Pair(1L, 0L); + final Pair incSecond = new Pair(0L, 1L); + for(Cluster c1 : split) { + for(DBID id1 : c1.getIDs()) { + // in-cluster distances + DoubleMinMax iminmax = new DoubleMinMax(); + for(DBID id2 : c1.getIDs()) { + // skip the point itself. + if(id1 == id2) { + continue; + } + double d = distFunc.distance(id1, id2).doubleValue(); + + histogram.aggregate(d, incFirst); + + iminmax.put(d); + } + // aggregate + mimin.put(iminmax.getMin()); + mimax.put(iminmax.getMax()); + midif.put(iminmax.getDiff()); + // min/max + giminmax.put(iminmax.getMin()); + giminmax.put(iminmax.getMax()); + + // other-cluster distances + DoubleMinMax ominmax = new DoubleMinMax(); + for(Cluster c2 : split) { + if(c2 == c1) { + continue; + } + for(DBID id2 : c2.getIDs()) { + // skip the point itself (shouldn't happen though) + if(id1 == id2) { + continue; + } + double d = distFunc.distance(id1, id2).doubleValue(); + + histogram.aggregate(d, incSecond); + + ominmax.put(d); + } + } + // aggregate + momin.put(ominmax.getMin()); + momax.put(ominmax.getMax()); + modif.put(ominmax.getDiff()); + // min/max + gominmax.put(ominmax.getMin()); + gominmax.put(ominmax.getMax()); + if(progress != null) { + progress.incrementProcessed(logger); + } + } + } + if(progress != null) { + progress.ensureCompleted(logger); + } + // Update values (only needed for sampling case). + gminmax.setFirst(Math.min(giminmax.getMin(), gominmax.getMin())); + gminmax.setSecond(Math.max(giminmax.getMax(), gominmax.getMax())); + + if(stepprog != null) { + stepprog.setCompleted(logger); + } + + // count the number of samples we have in the data + long inum = 0; + long onum = 0; + for(Pair> ppair : histogram) { + inum += ppair.getSecond().getFirst(); + onum += ppair.getSecond().getSecond(); + } + long bnum = inum + onum; + // Note: when full sampling is added, this assertion won't hold anymore. + assert (bnum == relation.size() * (relation.size() - 1)); + + Collection binstat = new ArrayList(numbin); + for(Pair> ppair : histogram) { + final double icof = (inum == 0) ? 0 : ((double) ppair.getSecond().getFirst()) / inum / histogram.getBinsize(); + final double icaf = ((double) ppair.getSecond().getFirst()) / bnum / histogram.getBinsize(); + final double ocof = (onum == 0) ? 0 : ((double) ppair.getSecond().getSecond()) / onum / histogram.getBinsize(); + final double ocaf = ((double) ppair.getSecond().getSecond()) / bnum / histogram.getBinsize(); + DoubleVector row = new DoubleVector(new double[] { ppair.getFirst(), icof, icaf, ocof, ocaf }); + binstat.add(row); + } + HistogramResult result = new HistogramResult("Distance Histogram", "distance-histogram", binstat); + + result.addHeader("Absolute minimum distance (abs): " + gminmax.getMin()); + result.addHeader("Absolute maximum distance (abs): " + gminmax.getMax()); + result.addHeader("In-Cluster minimum distance (abs, avg, stddev): " + giminmax.getMin() + " " + mimin.getMean() + " " + mimin.getSampleStddev()); + result.addHeader("In-Cluster maximum distance (abs, avg, stddev): " + giminmax.getMax() + " " + mimax.getMean() + " " + mimax.getSampleStddev()); + result.addHeader("Other-Cluster minimum distance (abs, avg, stddev): " + gominmax.getMin() + " " + momin.getMean() + " " + momin.getSampleStddev()); + result.addHeader("Other-Cluster maximum distance (abs, avg, stddev): " + gominmax.getMax() + " " + momax.getMean() + " " + momax.getSampleStddev()); + result.addHeader("Column description: bin center, in-cluster only frequency, in-cluster all frequency, other-cluster only frequency, other cluster all frequency"); + result.addHeader("In-cluster value count: " + inum + " other cluster value count: " + onum); + return result; + } + + private DoubleMinMax sampleMinMax(Relation database, DistanceQuery distFunc) { + int size = database.size(); + Random rnd = new Random(); + // estimate minimum and maximum. + int k = (int) Math.max(25, Math.pow(database.size(), 0.2)); + TreeSet> minhotset = new TreeSet>(); + TreeSet> maxhotset = new TreeSet>(Collections.reverseOrder()); + + int randomsize = (int) Math.max(25, Math.pow(database.size(), 0.2)); + double rprob = ((double) randomsize) / size; + ArrayModifiableDBIDs randomset = DBIDUtil.newArray(randomsize); + + Iterator iter = database.iterDBIDs(); + if(!iter.hasNext()) { + throw new IllegalStateException(ExceptionMessages.DATABASE_EMPTY); + } + DBID firstid = iter.next(); + minhotset.add(new FCPair(Double.MAX_VALUE, firstid)); + maxhotset.add(new FCPair(Double.MIN_VALUE, firstid)); + while(iter.hasNext()) { + DBID id1 = iter.next(); + // generate candidates for min distance. + ArrayList> np = new ArrayList>(k * 2 + randomsize * 2); + for(FCPair pair : minhotset) { + DBID id2 = pair.getSecond(); + // skip the object itself + if(id1.compareTo(id2) == 0) { + continue; + } + double d = distFunc.distance(id1, id2).doubleValue(); + np.add(new FCPair(d, id1)); + np.add(new FCPair(d, id2)); + } + for(DBID id2 : randomset) { + double d = distFunc.distance(id1, id2).doubleValue(); + np.add(new FCPair(d, id1)); + np.add(new FCPair(d, id2)); + } + minhotset.addAll(np); + shrinkHeap(minhotset, k); + + // generate candidates for max distance. + ArrayList> np2 = new ArrayList>(k * 2 + randomsize * 2); + for(FCPair pair : minhotset) { + DBID id2 = pair.getSecond(); + // skip the object itself + if(id1.compareTo(id2) == 0) { + continue; + } + double d = distFunc.distance(id1, id2).doubleValue(); + np2.add(new FCPair(d, id1)); + np2.add(new FCPair(d, id2)); + } + for(DBID id2 : randomset) { + double d = distFunc.distance(id1, id2).doubleValue(); + np.add(new FCPair(d, id1)); + np.add(new FCPair(d, id2)); + } + maxhotset.addAll(np2); + shrinkHeap(maxhotset, k); + + // update random set + if(randomset.size() < randomsize) { + randomset.add(id1); + } + else if(rnd.nextDouble() < rprob) { + randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), id1); + } + } + return new DoubleMinMax(minhotset.first().getFirst(), maxhotset.first().getFirst()); + } + + private DoubleMinMax exactMinMax(Relation database, DistanceQuery distFunc) { + DoubleMinMax minmax = new DoubleMinMax(); + // find exact minimum and maximum first. + for(DBID id1 : database.iterDBIDs()) { + for(DBID id2 : database.iterDBIDs()) { + // skip the point itself. + if(id1.compareTo(id2) == 0) { + continue; + } + double d = distFunc.distance(id1, id2).doubleValue(); + minmax.put(d); + } + } + return minmax; + } + + private void shrinkHeap(TreeSet> hotset, int k) { + // drop duplicates + ModifiableDBIDs seenids = DBIDUtil.newHashSet(2 * k); + int cnt = 0; + for(Iterator> i = hotset.iterator(); i.hasNext();) { + FCPair p = i.next(); + if(cnt > k || seenids.contains(p.getSecond())) { + i.remove(); + } + else { + seenids.add(p.getSecond()); + cnt++; + } + } + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractDistanceBasedAlgorithm.Parameterizer { + /** + * Number of bins to use in sampling. + */ + private int numbin = 20; + + /** + * Sampling + */ + private boolean sampling = false; + + /** + * Sampling + */ + private boolean exact = false; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter numbinP = new IntParameter(HISTOGRAM_BINS_ID, new GreaterEqualConstraint(2), 20); + if(config.grab(numbinP)) { + numbin = numbinP.getValue(); + } + + final Flag EXACT_FLAG = new Flag(EXACT_ID); + if(config.grab(EXACT_FLAG)) { + exact = EXACT_FLAG.getValue(); + } + + final Flag SAMPLING_FLAG = new Flag(SAMPLING_ID); + if(config.grab(SAMPLING_FLAG)) { + sampling = SAMPLING_FLAG.getValue(); + } + + ArrayList> exclusive = new ArrayList>(); + exclusive.add(EXACT_FLAG); + exclusive.add(SAMPLING_FLAG); + config.checkConstraint(new OnlyOneIsAllowedToBeSetGlobalConstraint(exclusive)); + } + + @Override + protected DistanceStatisticsWithClasses makeInstance() { + return new DistanceStatisticsWithClasses(distanceFunction, numbin, exact, sampling); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java new file mode 100644 index 00000000..cbbe578e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java @@ -0,0 +1,219 @@ +package de.lmu.ifi.dbs.elki.algorithm.statistics; +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2011 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.AggregatingHistogram; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.result.CollectionResult; +import de.lmu.ifi.dbs.elki.result.HistogramResult; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Evaluate a distance function with respect to kNN queries. For each point, the + * neighbors are sorted by distance, then the ROC AUC is computed. A score of 1 + * means that the distance function provides a perfect ordering of relevant + * neighbors first, then irrelevant neighbors. A value of 0.5 can be obtained by + * random sorting. A value of 0 means the distance function is inverted, i.e. a + * similarity. + * + * In contrast to {@link RankingQualityHistogram}, this method uses a binning + * based on the centrality of objects. This allows analyzing whether or not a + * particular distance degrades for the outer parts of a cluster. + * + * TODO: Allow fixed binning range, configurable + * + * TODO: Add sampling + * + * @author Erich Schubert + * @param Vector type + * @param Distance type + */ +@Title("Evaluate Ranking Quality") +@Description("Evaluates the effectiveness of a distance function via the obtained rankings.") +public class EvaluateRankingQuality, D extends NumberDistance> extends AbstractDistanceBasedAlgorithm> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(EvaluateRankingQuality.class); + + /** + * Option to configure the number of bins to use. + */ + public static final OptionID HISTOGRAM_BINS_ID = OptionID.getOrCreateOptionID("rankqual.bins", "Number of bins to use in the histogram"); + + /** + * Constructor. + * + * @param distanceFunction + * @param numbins + */ + public EvaluateRankingQuality(DistanceFunction distanceFunction, int numbins) { + super(distanceFunction); + this.numbins = numbins; + } + + /** + * Number of bins to use. + */ + int numbins = 20; + + /** + * Run the algorithm. + */ + @Override + public HistogramResult run(Database database) throws IllegalStateException { + final Relation relation = database.getRelation(getInputTypeRestriction()[0]); + final DistanceQuery distQuery = database.getDistanceQuery(relation, getDistanceFunction()); + final KNNQuery knnQuery = database.getKNNQuery(distQuery, relation.size()); + + if(logger.isVerbose()) { + logger.verbose("Preprocessing clusters..."); + } + // Cluster by labels + Collection> split = (new ByLabelClustering()).run(database).getAllClusters(); + + // Compute cluster averages and covariance matrix + HashMap, V> averages = new HashMap, V>(split.size()); + HashMap, Matrix> covmats = new HashMap, Matrix>(split.size()); + for(Cluster clus : split) { + averages.put(clus, DatabaseUtil.centroid(relation, clus.getIDs())); + covmats.put(clus, DatabaseUtil.covarianceMatrix(relation, clus.getIDs())); + } + + AggregatingHistogram hist = AggregatingHistogram.MeanVarianceHistogram(numbins, 0.0, 1.0); + + if(logger.isVerbose()) { + logger.verbose("Processing points..."); + } + FiniteProgress rocloop = logger.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), logger) : null; + + // sort neighbors + for(Cluster clus : split) { + ArrayList> cmem = new ArrayList>(clus.size()); + Vector av = averages.get(clus).getColumnVector(); + Matrix covm = covmats.get(clus); + + for(DBID i1 : clus.getIDs()) { + Double d = MathUtil.mahalanobisDistance(covm, av.minus(relation.get(i1).getColumnVector())); + cmem.add(new FCPair(d, i1)); + } + Collections.sort(cmem); + + for(int ind = 0; ind < cmem.size(); ind++) { + DBID i1 = cmem.get(ind).getSecond(); + List> knn = knnQuery.getKNNForDBID(i1, relation.size()); + double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); + + hist.aggregate(((double) ind) / clus.size(), result); + + if(rocloop != null) { + rocloop.incrementProcessed(logger); + } + } + } + if(rocloop != null) { + rocloop.ensureCompleted(logger); + } + // Collections.sort(results); + + // Transform Histogram into a Double Vector array. + Collection res = new ArrayList(relation.size()); + for(Pair pair : hist) { + DoubleVector row = new DoubleVector(new double[] { pair.getFirst(), pair.getSecond().getCount(), pair.getSecond().getMean(), pair.getSecond().getSampleVariance() }); + res.add(row); + } + return new HistogramResult("Ranking Quality Histogram", "ranking-histogram", res); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(new CombinedTypeInformation(getDistanceFunction().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD)); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer, D extends NumberDistance> extends AbstractDistanceBasedAlgorithm.Parameterizer { + protected int numbins = 20; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter param = new IntParameter(HISTOGRAM_BINS_ID, new GreaterEqualConstraint(2), 20); + if(config.grab(param)) { + numbins = param.getValue(); + } + } + + @Override + protected EvaluateRankingQuality makeInstance() { + return new EvaluateRankingQuality(distanceFunction, numbins); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java new file mode 100644 index 00000000..5fdc2c9f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java @@ -0,0 +1,183 @@ +package de.lmu.ifi.dbs.elki.algorithm.statistics; +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2011 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.DoubleVector; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.AggregatingHistogram; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.CollectionResult; +import de.lmu.ifi.dbs.elki.result.HistogramResult; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * Evaluate a distance function with respect to kNN queries. For each point, the + * neighbors are sorted by distance, then the ROC AUC is computed. A score of 1 + * means that the distance function provides a perfect ordering of relevant + * neighbors first, then irrelevant neighbors. A value of 0.5 can be obtained by + * random sorting. A value of 0 means the distance function is inverted, i.e. a + * similarity. + * + * TODO: Add sampling + * + * @author Erich Schubert + * @param Object type + * @param Distance type + */ +@Title("Ranking Quality Histogram") +@Description("Evaluates the effectiveness of a distance function via the obtained rankings.") +public class RankingQualityHistogram> extends AbstractDistanceBasedAlgorithm> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(RankingQualityHistogram.class); + + /** + * Option to configure the number of bins to use. + */ + public static final OptionID HISTOGRAM_BINS_ID = OptionID.getOrCreateOptionID("rankqual.bins", "Number of bins to use in the histogram"); + + /** + * Number of bins to use. + */ + int numbins = 100; + + /** + * Constructor. + * + * @param distanceFunction Distance function to evaluate + * @param numbins Number of bins + */ + public RankingQualityHistogram(DistanceFunction distanceFunction, int numbins) { + super(distanceFunction); + this.numbins = numbins; + } + + public HistogramResult run(Database database, Relation relation) { + final DistanceQuery distanceQuery = database.getDistanceQuery(relation, getDistanceFunction()); + final KNNQuery knnQuery = database.getKNNQuery(distanceQuery, relation.size()); + + if(logger.isVerbose()) { + logger.verbose("Preprocessing clusters..."); + } + // Cluster by labels + Collection> split = (new ByLabelClustering()).run(database).getAllClusters(); + + AggregatingHistogram hist = AggregatingHistogram.DoubleSumHistogram(numbins, 0.0, 1.0); + + if(logger.isVerbose()) { + logger.verbose("Processing points..."); + } + FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), logger) : null; + + MeanVariance mv = new MeanVariance(); + // sort neighbors + for(Cluster clus : split) { + for(DBID i1 : clus.getIDs()) { + List> knn = knnQuery.getKNNForDBID(i1, relation.size()); + double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); + + mv.put(result); + hist.aggregate(result, 1. / relation.size()); + + if(progress != null) { + progress.incrementProcessed(logger); + } + } + } + if(progress != null) { + progress.ensureCompleted(logger); + } + + // Transform Histogram into a Double Vector array. + Collection res = new ArrayList(relation.size()); + for(Pair pair : hist) { + DoubleVector row = new DoubleVector(new double[] { pair.getFirst(), pair.getSecond() }); + res.add(row); + } + HistogramResult result = new HistogramResult("Ranking Quality Histogram", "ranking-histogram", res); + result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance()); + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer> extends AbstractDistanceBasedAlgorithm.Parameterizer { + protected int numbins = 20; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter param = new IntParameter(HISTOGRAM_BINS_ID, new GreaterEqualConstraint(2), 100); + if(config.grab(param)) { + numbins = param.getValue(); + } + } + + @Override + protected RankingQualityHistogram makeInstance() { + return new RankingQualityHistogram(distanceFunction, numbins); + } + } +} \ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java new file mode 100644 index 00000000..3928f01a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/package-info.java @@ -0,0 +1,29 @@ +/** + *

Statistical analysis algorithms

+ * + * The algorithms in this package perform statistical analysis of the data + * (e.g. compute distributions, distance distributions etc.) + */ +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2011 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +package de.lmu.ifi.dbs.elki.algorithm.statistics; \ No newline at end of file -- cgit v1.2.3