summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java361
1 files changed, 361 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
new file mode 100644
index 00000000..97c79dfb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
@@ -0,0 +1,361 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeMap;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.BasicResult;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Creates segments of two or more clusterings.
+ *
+ * <p>
+ * Segments are the equally paired database objects of all given (2+)
+ * clusterings. Given a contingency table, an object Segment represents the
+ * table's cells where an intersection of classes and labels are given. Pair
+ * Segments are created by converting an object Segment into its pair
+ * representation. Converting all object Segments into pair Segments results in
+ * a larger number of pair Segments, if any fragmentation (no perfect match of
+ * clusters) within the contingency table has occurred (multiple cells on one
+ * row or column). Thus for ever object Segment exists a corresponding pair
+ * Segment. Additionally pair Segments represent pairs that are only in one
+ * Clustering which occurs for each split of a clusterings cluster by another
+ * clustering. Here, these pair Segments are referenced as fragmented Segments.
+ * Within the visualization they describe (at least two) pair Segments that have
+ * a corresponding object Segment.
+ * </p>
+ *
+ * <p>
+ * Reference:<br />
+ * Evaluation of Clusterings – Metrics and Visual Support<br />
+ * Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur
+ * Zimek<br />
+ * In: Proc. 28th International Conference on Data Engineering (ICDE) 2012
+ * </p>
+ *
+ * @author Sascha Goldhofer
+ * @author Erich Schubert
+ */
+@Reference(title = "Evaluation of Clusterings – Metrics and Visual Support", authors = "Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", booktitle = "Proc. 28th International Conference on Data Engineering (ICDE) 2012", url = "http://elki.dbs.ifi.lmu.de/wiki/PairSegments")
+public class Segments extends BasicResult implements Iterable<Segment> {
+ /**
+ * Class logger
+ */
+ private static final Logging logger = Logging.getLogger(Segments.class);
+
+ /**
+ * Clusterings
+ */
+ private List<Clustering<?>> clusterings;
+
+ /**
+ * Clusters
+ */
+ private List<List<? extends Cluster<?>>> clusters;
+
+ /**
+ * Number of clusterings in comparison
+ */
+ private int clusteringsCount;
+
+ /**
+ * Number of Clusters for each clustering
+ */
+ private int[] numclusters;
+
+ /**
+ * Total number of objects
+ */
+ private int totalObjects;
+
+ /**
+ * Pairs actually present in the data set
+ */
+ private int actualPairs;
+
+ /**
+ * The actual segments
+ */
+ private TreeMap<Segment, Segment> segments;
+
+ /**
+ * Initialize segments. Add DB objects via addObject method.
+ *
+ * @param clusterings List of clusterings in comparison
+ * @param baseResult used to retrieve db objects
+ */
+ public Segments(List<Clustering<?>> clusterings, HierarchicalResult baseResult) {
+ super("cluster pair segments", "pair-segments");
+ this.clusterings = clusterings;
+ this.clusteringsCount = clusterings.size();
+ segments = new TreeMap<Segment, Segment>(); // TODO: replace with array list
+
+ numclusters = new int[clusteringsCount];
+ clusters = new ArrayList<List<? extends Cluster<?>>>(clusteringsCount);
+
+ // save count of clusters
+ int clusteringIndex = 0;
+ for(Clustering<?> clr : clusterings) {
+ List<? extends Cluster<?>> curClusters = clr.getAllClusters();
+ clusters.add(curClusters);
+ numclusters[clusteringIndex] = curClusters.size();
+
+ clusteringIndex++;
+ }
+
+ recursivelyFill(clusters);
+ for(Segment seg : segments.keySet()) {
+ actualPairs += seg.getPairCount();
+ }
+ }
+
+ private void recursivelyFill(List<List<? extends Cluster<?>>> cs) {
+ final int numclusterings = cs.size();
+ Iterator<? extends Cluster<?>> iter = cs.get(0).iterator();
+ int[] path = new int[numclusterings];
+ for(int cnum = 0; iter.hasNext(); cnum++) {
+ Cluster<?> clust = iter.next();
+ path[0] = cnum;
+ if(numclusterings > 1) {
+ SetDBIDs idset = DBIDUtil.ensureSet(clust.getIDs());
+ recursivelyFill(cs, 1, idset, idset, path, true);
+ }
+ else {
+ // Add to results.
+ makeOrUpdateSegment(path, clust.getIDs(), clust.size() * (clust.size() - 1));
+ }
+
+ totalObjects += clust.size();
+ }
+ }
+
+ private void recursivelyFill(List<List<? extends Cluster<?>>> cs, int depth, SetDBIDs first, SetDBIDs second, int[] path, boolean objectsegment) {
+ final int numclusterings = cs.size();
+ Iterator<? extends Cluster<?>> iter = cs.get(depth).iterator();
+ for(int cnum = 0; iter.hasNext(); cnum++) {
+ Cluster<?> clust = iter.next();
+ // Compute intersections with new cluster.
+ // nfp := intersection( first, cluster )
+ // Adding asymmetric differences to nd1, nd2.
+ // nse := intersection( second, cluster )
+ HashSetModifiableDBIDs nfirstp = DBIDUtil.newHashSet(first.size());
+ HashSetModifiableDBIDs ndelta1 = DBIDUtil.newHashSet(first);
+ HashSetModifiableDBIDs ndelta2 = DBIDUtil.newHashSet();
+ HashSetModifiableDBIDs nsecond = DBIDUtil.newHashSet(second.size());
+ for(DBID id : clust.getIDs()) {
+ if(ndelta1.remove(id)) {
+ nfirstp.add(id);
+ }
+ else {
+ ndelta2.add(id);
+ }
+ if(second.contains(id)) {
+ nsecond.add(id);
+ }
+ }
+ if(nsecond.size() <= 0) {
+ continue; // disjoint
+ }
+ if(nfirstp.size() > 0) {
+ path[depth] = cnum;
+ if(depth < numclusterings - 1) {
+ recursivelyFill(cs, depth + 1, nfirstp, nsecond, path, objectsegment);
+ }
+ else {
+ // Add to results.
+ // In fact, nfirstp should equal nsecond here
+ int selfpairs = DBIDUtil.intersection(nfirstp, nsecond).size();
+ if(objectsegment) {
+ makeOrUpdateSegment(path, nfirstp, (nfirstp.size() * nsecond.size()) - selfpairs);
+ }
+ else {
+ makeOrUpdateSegment(path, null, (nfirstp.size() * nsecond.size()) - selfpairs);
+ }
+ }
+ }
+ // Elements that were in first, but in not in the cluster
+ if(ndelta1.size() > 0) {
+ path[depth] = Segment.UNCLUSTERED;
+ if(depth < numclusterings - 1) {
+ recursivelyFill(cs, depth + 1, ndelta1, nsecond, path, false);
+ }
+ else {
+ // Add to results.
+ int selfpairs = DBIDUtil.intersection(ndelta1, nsecond).size();
+ makeOrUpdateSegment(path, null, (ndelta1.size() * nsecond.size()) - selfpairs);
+ }
+ }
+ // FIXME: this part doesn't work right yet for over 2 clusterings!
+ // It used to work in revision 9236, eventually go back to this code!
+ if(ndelta2.size() > 0 && objectsegment) {
+ int[] npath = new int[path.length];
+ Arrays.fill(npath, Segment.UNCLUSTERED);
+ npath[depth] = cnum;
+ if(depth < numclusterings - 1) {
+ recursivelyFill(cs, depth + 1, ndelta2, nsecond, npath, false);
+ }
+ else {
+ // Add to results.
+ int selfpairs = DBIDUtil.intersection(ndelta2, nsecond).size();
+ makeOrUpdateSegment(npath, null, (ndelta2.size() * nsecond.size()) - selfpairs);
+ }
+ }
+ }
+ }
+
+ private void makeOrUpdateSegment(int[] path, DBIDs ids, int pairsize) {
+ Segment seg = segments.get(new Segment(path));
+ if(seg == null) {
+ seg = new Segment(path.clone());
+ segments.put(seg, seg);
+ }
+ if(ids != null) {
+ if(seg.getDBIDs() != null) {
+ logger.warning("Expected segment to not have IDs.");
+ }
+ seg.objIDs = ids;
+ }
+ seg.pairsize += pairsize;
+ }
+
+ /**
+ * Get the description of the nth clustering.
+ *
+ * @param clusteringID Clustering number
+ * @return long name of clustering
+ */
+ public String getClusteringDescription(int clusteringID) {
+ return clusterings.get(clusteringID).getLongName();
+ }
+
+ /**
+ * Return to a given segment with unpaired objects, the corresponding segments
+ * that result in an unpaired segment. So, one cluster of a clustering is
+ * split by another clustering in multiple segments, resulting in a segment
+ * with unpaired objects, describing the missing pairs between the split
+ * cluster / between the segments.
+ *
+ * Basically we compare only two clusterings at once. If those clusterings do
+ * not have the whole cluster in common, we have at least three segments (two
+ * cluster), one of them containing the unpaired segment. A segmentID 3-0,
+ * describes a cluster 3 in clustering 1 (index 0) and all clusters 3-x in
+ * clustering 2. So we search for all segments 3-x (0 being a wildcard).
+ *
+ * @param unpairedSegment
+ * @return Segments describing the set of objects that result in an unpaired
+ * segment
+ */
+ public List<Segment> getPairedSegments(Segment unpairedSegment) {
+ ArrayList<Segment> pairedSegments = new ArrayList<Segment>();
+ // search the segments. Index at "unpairedClustering" being the wildcard.
+ segments: for(Segment segment : this) {
+ // if mismatch except at unpaired Clustering index => exclude.
+ for(int i = 0; i < clusteringsCount; i++) {
+ if(unpairedSegment.get(i) != Segment.UNCLUSTERED) {
+ // mismatch
+ if(segment.get(i) != unpairedSegment.get(i)) {
+ continue segments;
+ }
+ }
+ // do not add wildcard
+ if(segment.get(i) == Segment.UNCLUSTERED) {
+ continue segments;
+ }
+ }
+
+ // add segment to list
+ pairedSegments.add(segment);
+ }
+
+ return pairedSegments;
+ }
+
+ /**
+ * @param temp Temporary segment to be unified
+ * @return the segmentID given by its string representation
+ */
+ public Segment unifySegment(Segment temp) {
+ Segment found = segments.get(temp);
+ return (found != null) ? found : temp;
+ }
+
+ /**
+ * Get the number of segments
+ *
+ * @return Number of segments
+ */
+ public int size() {
+ return segments.size();
+ }
+
+ /**
+ * Get total number of pairs with or without the unclustered pairs.
+ *
+ * @param withUnclusteredPairs if false, segment with unclustered pairs is
+ * removed
+ * @return pair count, with or without unclusted (non-existant) pairs
+ */
+ public int getPairCount(boolean withUnclusteredPairs) {
+ if(withUnclusteredPairs) {
+ return (totalObjects * (totalObjects - 1)); // / 2;
+ }
+ else {
+ return actualPairs;
+ }
+ }
+
+ /**
+ * Get the number of clusterings
+ *
+ * @return number of clusterings compared
+ */
+ public int getClusterings() {
+ return clusteringsCount;
+ }
+
+ /**
+ * Return the sum of all clusters
+ *
+ * @return sum of all cluster counts
+ */
+ public int getTotalClusterCount() {
+ int clusterCount = 0;
+
+ for(int i = 0; i < numclusters.length; i++) {
+ clusterCount += numclusters[i];
+ }
+
+ return clusterCount;
+ }
+
+ /**
+ * Returns the highest number of Clusters in the clusterings
+ *
+ * @return highest cluster count
+ */
+ public int getHighestClusterCount() {
+ int maxClusters = 0;
+
+ for(int i = 0; i < numclusters.length; i++) {
+ maxClusters = Math.max(maxClusters, numclusters[i]);
+ }
+ return maxClusters;
+ }
+
+ @Override
+ public Iterator<Segment> iterator() {
+ return segments.keySet().iterator();
+ }
+} \ No newline at end of file