diff options
author | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:41 +0000 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:41 +0000 |
commit | 38212b3127e90751fb39cda34250bc11be62b76c (patch) | |
tree | dc1397346030e9695bd763dddc93b3be527cd643 /elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java | |
parent | 337087b668d3a54f3afee3a9adb597a32e9f7e94 (diff) |
Import Upstream version 0.7.0
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java | 275 |
1 files changed, 275 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java new file mode 100644 index 00000000..2c3a20e0 --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/evaluation/clustering/internal/EvaluatePBMIndex.java @@ -0,0 +1,275 @@ +package de.lmu.ifi.dbs.elki.evaluation.clustering.internal; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.evaluation.Evaluator; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; +import de.lmu.ifi.dbs.elki.result.EvaluationResult; +import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultHierarchy; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * + * Compute the PBM of a data set + * + * Reference: + * <p> + * M. K. Pakhira, and S. Bandyopadhyay, and U. Maulik<br /> + * Validity index for crisp and fuzzy clusters<br /> + * Pattern recognition, 37(3) + * </p> + * + * @author Stephan Baier + * @author Erich Schubert + * + * @apiviz.composedOf NoiseHandling + */ +@Reference(authors = "M. K. Pakhira, and S. Bandyopadhyay, and U. Maulik", // +title = "Validity index for crisp and fuzzy clusters",// +booktitle = "Pattern recognition, 37(3)",// +url = "http://dx.doi.org/10.1016/j.patcog.2003.06.005") +public class EvaluatePBMIndex implements Evaluator { + /** + * Logger for debug output. + */ + private static final Logging LOG = Logging.getLogger(EvaluatePBMIndex.class); + + /** + * Option for noise handling. + */ + private NoiseHandling noiseHandling; + + /** + * Distance function to use. + */ + private NumberVectorDistanceFunction<?> distanceFunction; + + /** + * Key for logging statistics. + */ + private String key = EvaluatePBMIndex.class.getName(); + + /** + * Constructor. + * + * @param distance Distance function + * @param noiseOpt Flag to control noise handling + */ + public EvaluatePBMIndex(NumberVectorDistanceFunction<?> distance, NoiseHandling noiseOpt) { + super(); + this.distanceFunction = distance; + this.noiseHandling = noiseOpt; + } + + /** + * Evaluate a single clustering. + * + * @param db Database + * @param rel Data relation + * @param c Clustering + * @return PBM + */ + public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) { + List<? extends Cluster<?>> clusters = c.getAllClusters(); + NumberVector[] centroids = new NumberVector[clusters.size()]; + int ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseHandling); + + // Build global centroid and cluster count: + final int dim = RelationUtil.dimensionality(rel); + Centroid overallCentroid = new Centroid(dim); + EvaluateVarianceRatioCriteria.globalCentroid(overallCentroid, rel, clusters, centroids, noiseHandling); + + // Maximum distance between centroids: + double max = 0; + for(int i = 0; i < centroids.length; i++) { + if(centroids[i] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) { + continue; + } + for(int j = i + 1; j < centroids.length; j++) { + if(centroids[j] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) { + continue; + } + if(centroids[i] == null && centroids[j] == null) { + // Need to compute pairwise distances of noise clusters. + for(DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) { + for(DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) { + double dist = distanceFunction.distance(rel.get(iti), rel.get(itj)); + max = dist > max ? dist : max; + } + } + } + else if(centroids[i] == null) { + for(DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) { + double dist = distanceFunction.distance(rel.get(iti), centroids[j]); + max = dist > max ? dist : max; + } + } + else if(centroids[j] == null) { + for(DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) { + double dist = distanceFunction.distance(centroids[i], rel.get(itj)); + max = dist > max ? dist : max; + } + } + else { + double dist = distanceFunction.distance(centroids[i], centroids[j]); + max = dist > max ? dist : max; + } + } + } + + // a: Distance to own centroid + // b: Distance to overall centroid + double a = 0, b = 0; + Iterator<? extends Cluster<?>> ci = clusters.iterator(); + for(int i = 0; ci.hasNext(); i++) { + Cluster<?> cluster = ci.next(); + if(cluster.size() <= 1 || cluster.isNoise()) { + switch(noiseHandling){ + case IGNORE_NOISE: + continue; // Ignored + case TREAT_NOISE_AS_SINGLETONS: + // Singletons: a = 0 by definition. + for(DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) { + b += SquaredEuclideanDistanceFunction.STATIC.distance(overallCentroid, rel.get(it)); + } + continue; // with NEXT cluster. + case MERGE_NOISE: + break; // Treat like a cluster below: + } + } + + for(DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) { + NumberVector obj = rel.get(it); + a += distanceFunction.distance(centroids[i], obj); + b += distanceFunction.distance(overallCentroid, obj); + } + } + + final double pbm = Math.pow((1. / centroids.length) * (b / a) * max, 2.); + + if(LOG.isStatistics()) { + LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString())); + if(ignorednoise > 0) { + LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise)); + } + LOG.statistics(new DoubleStatistic(key + ".pbm", pbm)); + } + + EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation"); + MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation"); + g.addMeasure("PBM-Index", pbm, 0., Double.POSITIVE_INFINITY, 0., false); + db.getHierarchy().resultChanged(ev); + return pbm; + } + + @Override + public void processNewResult(ResultHierarchy hier, Result result) { + List<Clustering<?>> crs = ResultUtil.getClusteringResults(result); + if(crs.size() < 1) { + return; + } + Database db = ResultUtil.findDatabase(hier); + Relation<? extends NumberVector> rel = db.getRelation(this.distanceFunction.getInputTypeRestriction()); + + for(Clustering<?> c : crs) { + evaluateClustering(db, (Relation<? extends NumberVector>) rel, c); + } + } + + /** + * Parameterization class. + * + * @author Stephan Baier + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter for choosing the distance function. + */ + public static final OptionID DISTANCE_ID = new OptionID("pbm.distance", "Distance function to use for computing PBM."); + + /** + * Parameter for the option, how noise should be treated. + */ + public static final OptionID NOISE_ID = new OptionID("pbm.noisehandling", "Control how noise should be treated."); + + /** + * Distance function to use. + */ + private NumberVectorDistanceFunction<?> distance; + + /** + * Option, how noise should be treated. + */ + private NoiseHandling noiseHandling; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + ObjectParameter<NumberVectorDistanceFunction<?>> distanceFunctionP = new ObjectParameter<>(DISTANCE_ID, NumberVectorDistanceFunction.class, EuclideanDistanceFunction.class); + if(config.grab(distanceFunctionP)) { + distance = distanceFunctionP.instantiateClass(config); + } + + EnumParameter<NoiseHandling> noiseP = new EnumParameter<NoiseHandling>(NOISE_ID, NoiseHandling.class, NoiseHandling.TREAT_NOISE_AS_SINGLETONS); + if(config.grab(noiseP)) { + noiseHandling = noiseP.getValue(); + } + } + + @Override + protected EvaluatePBMIndex makeInstance() { + return new EvaluatePBMIndex(distance, noiseHandling); + } + } + +} |