summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments')
-rw-r--r--src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java47
-rw-r--r--src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java149
-rw-r--r--src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java361
-rw-r--r--src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java26
4 files changed, 583 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java
new file mode 100644
index 00000000..2471b135
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java
@@ -0,0 +1,47 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.result.Result;
+import de.lmu.ifi.dbs.elki.result.ResultUtil;
+
+/**
+ * Evaluate clustering results by building segments for their pairs: shared
+ * pairs and differences.
+ *
+ * @author Sascha Goldhofer
+ * @author Erich Schubert
+ *
+ * @apiviz.uses Clustering
+ * @apiviz.uses Segments
+ */
+public class ClusterPairSegmentAnalysis implements Evaluator {
+ /**
+ * Constructor.
+ */
+ public ClusterPairSegmentAnalysis() {
+ super();
+ }
+
+ /**
+ * Perform clusterings evaluation
+ */
+ @Override
+ public void processNewResult(HierarchicalResult baseResult, Result result) {
+ // Get all new clusterings
+ // TODO: handle clusterings added later, too. Can we update the result?
+
+ List<Clustering<?>> clusterings = ResultUtil.getClusteringResults(result);
+ // Abort if not enough clusterings to compare
+ if(clusterings.size() < 2) {
+ return;
+ }
+
+ // create segments
+ Segments segments = new Segments(clusterings, baseResult);
+ baseResult.getHierarchy().add(result, segments);
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java
new file mode 100644
index 00000000..8ce8fd4a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java
@@ -0,0 +1,149 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+
+/**
+ * A segment represents a set of pairs that share the same clustering
+ * properties.
+ *
+ * As such, for each ring (= clustering), a cluster number (or the constant
+ * {@link #UNCLUSTERED}) is stored.
+ */
+public class Segment implements Comparable<Segment> {
+ /**
+ * Object is not clustered
+ */
+ public static final int UNCLUSTERED = -1;
+
+ /**
+ * IDs in segment, for object segments.
+ */
+ protected DBIDs objIDs = null;
+
+ /**
+ * Size of cluster, in pairs.
+ */
+ protected long pairsize = 0;
+
+ /**
+ * The cluster numbers in each ring
+ */
+ protected int[] clusterIds;
+
+ public Segment(int clusterings) {
+ clusterIds = new int[clusterings];
+ }
+
+ public long getPairCount() {
+ return pairsize;
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param clone Clone of cluster ids
+ */
+ public Segment(int[] clone) {
+ clusterIds = clone;
+ }
+
+ /**
+ * Get cluster number for index idx.
+ *
+ * @param idx Index
+ * @return Cluster number
+ */
+ public int get(int idx) {
+ return clusterIds[idx];
+ }
+
+ /**
+ * Checks if the segment has a cluster with unpaired objects. Unpaired
+ * clusters are represented by "0" (0 = all).
+ *
+ * @return true when unclustered in at least one dimension.
+ */
+ public boolean isUnpaired() {
+ for(int id : clusterIds) {
+ if(id == UNCLUSTERED) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Check if this segment contains the pairs that are never clustered by any of
+ * the clusterings (all 0).
+ *
+ * @return true when unclustered everywhere
+ */
+ public boolean isNone() {
+ for(int id : clusterIds) {
+ if(id != UNCLUSTERED) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns the index of the first clustering having an unpaired cluster, or -1
+ * no unpaired cluster exists.
+ *
+ * @return clustering id or -1
+ */
+ public int getUnpairedClusteringIndex() {
+ for(int index = 0; index < clusterIds.length; index++) {
+ if(clusterIds[index] == UNCLUSTERED) {
+ return index;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Get the DBIDs of objects contained in this segment.
+ *
+ * @return the segment IDs
+ */
+ public DBIDs getDBIDs() {
+ return objIDs;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if(!(Segment.class.isInstance(obj))) {
+ return false;
+ }
+ Segment other = (Segment) obj;
+ return Arrays.equals(clusterIds, other.clusterIds);
+ }
+
+ @Override
+ public int hashCode() {
+ return Arrays.hashCode(clusterIds);
+ }
+
+ @Override
+ public int compareTo(Segment sid) {
+ for(int i = 0; i < clusterIds.length; i++) {
+ final int a = this.clusterIds[i];
+ final int b = sid.clusterIds[i];
+ if(a != b) {
+ if(a * b > 0) {
+ // Regular comparison
+ return (a < b) ? -1 : +1;
+ // return (a < b) ? +1 : -1;
+ }
+ else {
+ // Inverse, to sort negative last
+ return (a < b) ? +1 : -1;
+ }
+ }
+ }
+ return 0;
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
new file mode 100644
index 00000000..97c79dfb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
@@ -0,0 +1,361 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeMap;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.BasicResult;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Creates segments of two or more clusterings.
+ *
+ * <p>
+ * Segments are the equally paired database objects of all given (2+)
+ * clusterings. Given a contingency table, an object Segment represents the
+ * table's cells where an intersection of classes and labels are given. Pair
+ * Segments are created by converting an object Segment into its pair
+ * representation. Converting all object Segments into pair Segments results in
+ * a larger number of pair Segments, if any fragmentation (no perfect match of
+ * clusters) within the contingency table has occurred (multiple cells on one
+ * row or column). Thus for ever object Segment exists a corresponding pair
+ * Segment. Additionally pair Segments represent pairs that are only in one
+ * Clustering which occurs for each split of a clusterings cluster by another
+ * clustering. Here, these pair Segments are referenced as fragmented Segments.
+ * Within the visualization they describe (at least two) pair Segments that have
+ * a corresponding object Segment.
+ * </p>
+ *
+ * <p>
+ * Reference:<br />
+ * Evaluation of Clusterings – Metrics and Visual Support<br />
+ * Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur
+ * Zimek<br />
+ * In: Proc. 28th International Conference on Data Engineering (ICDE) 2012
+ * </p>
+ *
+ * @author Sascha Goldhofer
+ * @author Erich Schubert
+ */
+@Reference(title = "Evaluation of Clusterings – Metrics and Visual Support", authors = "Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", booktitle = "Proc. 28th International Conference on Data Engineering (ICDE) 2012", url = "http://elki.dbs.ifi.lmu.de/wiki/PairSegments")
+public class Segments extends BasicResult implements Iterable<Segment> {
+ /**
+ * Class logger
+ */
+ private static final Logging logger = Logging.getLogger(Segments.class);
+
+ /**
+ * Clusterings
+ */
+ private List<Clustering<?>> clusterings;
+
+ /**
+ * Clusters
+ */
+ private List<List<? extends Cluster<?>>> clusters;
+
+ /**
+ * Number of clusterings in comparison
+ */
+ private int clusteringsCount;
+
+ /**
+ * Number of Clusters for each clustering
+ */
+ private int[] numclusters;
+
+ /**
+ * Total number of objects
+ */
+ private int totalObjects;
+
+ /**
+ * Pairs actually present in the data set
+ */
+ private int actualPairs;
+
+ /**
+ * The actual segments
+ */
+ private TreeMap<Segment, Segment> segments;
+
+ /**
+ * Initialize segments. Add DB objects via addObject method.
+ *
+ * @param clusterings List of clusterings in comparison
+ * @param baseResult used to retrieve db objects
+ */
+ public Segments(List<Clustering<?>> clusterings, HierarchicalResult baseResult) {
+ super("cluster pair segments", "pair-segments");
+ this.clusterings = clusterings;
+ this.clusteringsCount = clusterings.size();
+ segments = new TreeMap<Segment, Segment>(); // TODO: replace with array list
+
+ numclusters = new int[clusteringsCount];
+ clusters = new ArrayList<List<? extends Cluster<?>>>(clusteringsCount);
+
+ // save count of clusters
+ int clusteringIndex = 0;
+ for(Clustering<?> clr : clusterings) {
+ List<? extends Cluster<?>> curClusters = clr.getAllClusters();
+ clusters.add(curClusters);
+ numclusters[clusteringIndex] = curClusters.size();
+
+ clusteringIndex++;
+ }
+
+ recursivelyFill(clusters);
+ for(Segment seg : segments.keySet()) {
+ actualPairs += seg.getPairCount();
+ }
+ }
+
+ private void recursivelyFill(List<List<? extends Cluster<?>>> cs) {
+ final int numclusterings = cs.size();
+ Iterator<? extends Cluster<?>> iter = cs.get(0).iterator();
+ int[] path = new int[numclusterings];
+ for(int cnum = 0; iter.hasNext(); cnum++) {
+ Cluster<?> clust = iter.next();
+ path[0] = cnum;
+ if(numclusterings > 1) {
+ SetDBIDs idset = DBIDUtil.ensureSet(clust.getIDs());
+ recursivelyFill(cs, 1, idset, idset, path, true);
+ }
+ else {
+ // Add to results.
+ makeOrUpdateSegment(path, clust.getIDs(), clust.size() * (clust.size() - 1));
+ }
+
+ totalObjects += clust.size();
+ }
+ }
+
+ private void recursivelyFill(List<List<? extends Cluster<?>>> cs, int depth, SetDBIDs first, SetDBIDs second, int[] path, boolean objectsegment) {
+ final int numclusterings = cs.size();
+ Iterator<? extends Cluster<?>> iter = cs.get(depth).iterator();
+ for(int cnum = 0; iter.hasNext(); cnum++) {
+ Cluster<?> clust = iter.next();
+ // Compute intersections with new cluster.
+ // nfp := intersection( first, cluster )
+ // Adding asymmetric differences to nd1, nd2.
+ // nse := intersection( second, cluster )
+ HashSetModifiableDBIDs nfirstp = DBIDUtil.newHashSet(first.size());
+ HashSetModifiableDBIDs ndelta1 = DBIDUtil.newHashSet(first);
+ HashSetModifiableDBIDs ndelta2 = DBIDUtil.newHashSet();
+ HashSetModifiableDBIDs nsecond = DBIDUtil.newHashSet(second.size());
+ for(DBID id : clust.getIDs()) {
+ if(ndelta1.remove(id)) {
+ nfirstp.add(id);
+ }
+ else {
+ ndelta2.add(id);
+ }
+ if(second.contains(id)) {
+ nsecond.add(id);
+ }
+ }
+ if(nsecond.size() <= 0) {
+ continue; // disjoint
+ }
+ if(nfirstp.size() > 0) {
+ path[depth] = cnum;
+ if(depth < numclusterings - 1) {
+ recursivelyFill(cs, depth + 1, nfirstp, nsecond, path, objectsegment);
+ }
+ else {
+ // Add to results.
+ // In fact, nfirstp should equal nsecond here
+ int selfpairs = DBIDUtil.intersection(nfirstp, nsecond).size();
+ if(objectsegment) {
+ makeOrUpdateSegment(path, nfirstp, (nfirstp.size() * nsecond.size()) - selfpairs);
+ }
+ else {
+ makeOrUpdateSegment(path, null, (nfirstp.size() * nsecond.size()) - selfpairs);
+ }
+ }
+ }
+ // Elements that were in first, but in not in the cluster
+ if(ndelta1.size() > 0) {
+ path[depth] = Segment.UNCLUSTERED;
+ if(depth < numclusterings - 1) {
+ recursivelyFill(cs, depth + 1, ndelta1, nsecond, path, false);
+ }
+ else {
+ // Add to results.
+ int selfpairs = DBIDUtil.intersection(ndelta1, nsecond).size();
+ makeOrUpdateSegment(path, null, (ndelta1.size() * nsecond.size()) - selfpairs);
+ }
+ }
+ // FIXME: this part doesn't work right yet for over 2 clusterings!
+ // It used to work in revision 9236, eventually go back to this code!
+ if(ndelta2.size() > 0 && objectsegment) {
+ int[] npath = new int[path.length];
+ Arrays.fill(npath, Segment.UNCLUSTERED);
+ npath[depth] = cnum;
+ if(depth < numclusterings - 1) {
+ recursivelyFill(cs, depth + 1, ndelta2, nsecond, npath, false);
+ }
+ else {
+ // Add to results.
+ int selfpairs = DBIDUtil.intersection(ndelta2, nsecond).size();
+ makeOrUpdateSegment(npath, null, (ndelta2.size() * nsecond.size()) - selfpairs);
+ }
+ }
+ }
+ }
+
+ private void makeOrUpdateSegment(int[] path, DBIDs ids, int pairsize) {
+ Segment seg = segments.get(new Segment(path));
+ if(seg == null) {
+ seg = new Segment(path.clone());
+ segments.put(seg, seg);
+ }
+ if(ids != null) {
+ if(seg.getDBIDs() != null) {
+ logger.warning("Expected segment to not have IDs.");
+ }
+ seg.objIDs = ids;
+ }
+ seg.pairsize += pairsize;
+ }
+
+ /**
+ * Get the description of the nth clustering.
+ *
+ * @param clusteringID Clustering number
+ * @return long name of clustering
+ */
+ public String getClusteringDescription(int clusteringID) {
+ return clusterings.get(clusteringID).getLongName();
+ }
+
+ /**
+ * Return to a given segment with unpaired objects, the corresponding segments
+ * that result in an unpaired segment. So, one cluster of a clustering is
+ * split by another clustering in multiple segments, resulting in a segment
+ * with unpaired objects, describing the missing pairs between the split
+ * cluster / between the segments.
+ *
+ * Basically we compare only two clusterings at once. If those clusterings do
+ * not have the whole cluster in common, we have at least three segments (two
+ * cluster), one of them containing the unpaired segment. A segmentID 3-0,
+ * describes a cluster 3 in clustering 1 (index 0) and all clusters 3-x in
+ * clustering 2. So we search for all segments 3-x (0 being a wildcard).
+ *
+ * @param unpairedSegment
+ * @return Segments describing the set of objects that result in an unpaired
+ * segment
+ */
+ public List<Segment> getPairedSegments(Segment unpairedSegment) {
+ ArrayList<Segment> pairedSegments = new ArrayList<Segment>();
+ // search the segments. Index at "unpairedClustering" being the wildcard.
+ segments: for(Segment segment : this) {
+ // if mismatch except at unpaired Clustering index => exclude.
+ for(int i = 0; i < clusteringsCount; i++) {
+ if(unpairedSegment.get(i) != Segment.UNCLUSTERED) {
+ // mismatch
+ if(segment.get(i) != unpairedSegment.get(i)) {
+ continue segments;
+ }
+ }
+ // do not add wildcard
+ if(segment.get(i) == Segment.UNCLUSTERED) {
+ continue segments;
+ }
+ }
+
+ // add segment to list
+ pairedSegments.add(segment);
+ }
+
+ return pairedSegments;
+ }
+
+ /**
+ * @param temp Temporary segment to be unified
+ * @return the segmentID given by its string representation
+ */
+ public Segment unifySegment(Segment temp) {
+ Segment found = segments.get(temp);
+ return (found != null) ? found : temp;
+ }
+
+ /**
+ * Get the number of segments
+ *
+ * @return Number of segments
+ */
+ public int size() {
+ return segments.size();
+ }
+
+ /**
+ * Get total number of pairs with or without the unclustered pairs.
+ *
+ * @param withUnclusteredPairs if false, segment with unclustered pairs is
+ * removed
+ * @return pair count, with or without unclusted (non-existant) pairs
+ */
+ public int getPairCount(boolean withUnclusteredPairs) {
+ if(withUnclusteredPairs) {
+ return (totalObjects * (totalObjects - 1)); // / 2;
+ }
+ else {
+ return actualPairs;
+ }
+ }
+
+ /**
+ * Get the number of clusterings
+ *
+ * @return number of clusterings compared
+ */
+ public int getClusterings() {
+ return clusteringsCount;
+ }
+
+ /**
+ * Return the sum of all clusters
+ *
+ * @return sum of all cluster counts
+ */
+ public int getTotalClusterCount() {
+ int clusterCount = 0;
+
+ for(int i = 0; i < numclusters.length; i++) {
+ clusterCount += numclusters[i];
+ }
+
+ return clusterCount;
+ }
+
+ /**
+ * Returns the highest number of Clusters in the clusterings
+ *
+ * @return highest cluster count
+ */
+ public int getHighestClusterCount() {
+ int maxClusters = 0;
+
+ for(int i = 0; i < numclusters.length; i++) {
+ maxClusters = Math.max(maxClusters, numclusters[i]);
+ }
+ return maxClusters;
+ }
+
+ @Override
+ public Iterator<Segment> iterator() {
+ return segments.keySet().iterator();
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java
new file mode 100644
index 00000000..27176ff6
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>Pair-segment analysis of multiple clusterings.</p>
+ */
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; \ No newline at end of file