package de.lmu.ifi.dbs.elki.evaluation.clustering.internal; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.evaluation.Evaluator; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.result.EvaluationResult; import de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup; import de.lmu.ifi.dbs.elki.result.Result; import de.lmu.ifi.dbs.elki.result.ResultHierarchy; import de.lmu.ifi.dbs.elki.result.ResultUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * * Compute the PBM of a data set * * Reference: *

* M. K. Pakhira, and S. Bandyopadhyay, and U. Maulik
* Validity index for crisp and fuzzy clusters
* Pattern recognition, 37(3) *

* * @author Stephan Baier * @author Erich Schubert * * @apiviz.composedOf NoiseHandling */ @Reference(authors = "M. K. Pakhira, and S. Bandyopadhyay, and U. Maulik", // title = "Validity index for crisp and fuzzy clusters",// booktitle = "Pattern recognition, 37(3)",// url = "http://dx.doi.org/10.1016/j.patcog.2003.06.005") public class EvaluatePBMIndex implements Evaluator { /** * Logger for debug output. */ private static final Logging LOG = Logging.getLogger(EvaluatePBMIndex.class); /** * Option for noise handling. */ private NoiseHandling noiseHandling; /** * Distance function to use. */ private NumberVectorDistanceFunction distanceFunction; /** * Key for logging statistics. */ private String key = EvaluatePBMIndex.class.getName(); /** * Constructor. * * @param distance Distance function * @param noiseOpt Flag to control noise handling */ public EvaluatePBMIndex(NumberVectorDistanceFunction distance, NoiseHandling noiseOpt) { super(); this.distanceFunction = distance; this.noiseHandling = noiseOpt; } /** * Evaluate a single clustering. * * @param db Database * @param rel Data relation * @param c Clustering * @return PBM */ public double evaluateClustering(Database db, Relation rel, Clustering c) { List> clusters = c.getAllClusters(); NumberVector[] centroids = new NumberVector[clusters.size()]; int ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseHandling); // Build global centroid and cluster count: final int dim = RelationUtil.dimensionality(rel); Centroid overallCentroid = new Centroid(dim); EvaluateVarianceRatioCriteria.globalCentroid(overallCentroid, rel, clusters, centroids, noiseHandling); // Maximum distance between centroids: double max = 0; for(int i = 0; i < centroids.length; i++) { if(centroids[i] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) { continue; } for(int j = i + 1; j < centroids.length; j++) { if(centroids[j] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) { continue; } if(centroids[i] == null && centroids[j] == null) { // Need to compute pairwise distances of noise clusters. for(DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) { for(DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) { double dist = distanceFunction.distance(rel.get(iti), rel.get(itj)); max = dist > max ? dist : max; } } } else if(centroids[i] == null) { for(DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) { double dist = distanceFunction.distance(rel.get(iti), centroids[j]); max = dist > max ? dist : max; } } else if(centroids[j] == null) { for(DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) { double dist = distanceFunction.distance(centroids[i], rel.get(itj)); max = dist > max ? dist : max; } } else { double dist = distanceFunction.distance(centroids[i], centroids[j]); max = dist > max ? dist : max; } } } // a: Distance to own centroid // b: Distance to overall centroid double a = 0, b = 0; Iterator> ci = clusters.iterator(); for(int i = 0; ci.hasNext(); i++) { Cluster cluster = ci.next(); if(cluster.size() <= 1 || cluster.isNoise()) { switch(noiseHandling){ case IGNORE_NOISE: continue; // Ignored case TREAT_NOISE_AS_SINGLETONS: // Singletons: a = 0 by definition. for(DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) { b += SquaredEuclideanDistanceFunction.STATIC.distance(overallCentroid, rel.get(it)); } continue; // with NEXT cluster. case MERGE_NOISE: break; // Treat like a cluster below: } } for(DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) { NumberVector obj = rel.get(it); a += distanceFunction.distance(centroids[i], obj); b += distanceFunction.distance(overallCentroid, obj); } } final double pbm = Math.pow((1. / centroids.length) * (b / a) * max, 2.); if(LOG.isStatistics()) { LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString())); if(ignorednoise > 0) { LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise)); } LOG.statistics(new DoubleStatistic(key + ".pbm", pbm)); } EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation"); MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation"); g.addMeasure("PBM-Index", pbm, 0., Double.POSITIVE_INFINITY, 0., false); db.getHierarchy().resultChanged(ev); return pbm; } @Override public void processNewResult(ResultHierarchy hier, Result result) { List> crs = ResultUtil.getClusteringResults(result); if(crs.size() < 1) { return; } Database db = ResultUtil.findDatabase(hier); Relation rel = db.getRelation(this.distanceFunction.getInputTypeRestriction()); for(Clustering c : crs) { evaluateClustering(db, (Relation) rel, c); } } /** * Parameterization class. * * @author Stephan Baier * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { /** * Parameter for choosing the distance function. */ public static final OptionID DISTANCE_ID = new OptionID("pbm.distance", "Distance function to use for computing PBM."); /** * Parameter for the option, how noise should be treated. */ public static final OptionID NOISE_ID = new OptionID("pbm.noisehandling", "Control how noise should be treated."); /** * Distance function to use. */ private NumberVectorDistanceFunction distance; /** * Option, how noise should be treated. */ private NoiseHandling noiseHandling; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter> distanceFunctionP = new ObjectParameter<>(DISTANCE_ID, NumberVectorDistanceFunction.class, EuclideanDistanceFunction.class); if(config.grab(distanceFunctionP)) { distance = distanceFunctionP.instantiateClass(config); } EnumParameter noiseP = new EnumParameter(NOISE_ID, NoiseHandling.class, NoiseHandling.TREAT_NOISE_AS_SINGLETONS); if(config.grab(noiseP)) { noiseHandling = noiseP.getValue(); } } @Override protected EvaluatePBMIndex makeInstance() { return new EvaluatePBMIndex(distance, noiseHandling); } } }