1 files changed, 361 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
new file mode 100644
index 00000000..97c79dfb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
@@ -0,0 +1,361 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeMap;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.BasicResult;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Creates segments of two or more clusterings.
+ * 
+ * <p>
+ * Segments are the equally paired database objects of all given (2+)
+ * clusterings. Given a contingency table, an object Segment represents the
+ * table's cells where an intersection of classes and labels are given. Pair
+ * Segments are created by converting an object Segment into its pair
+ * representation. Converting all object Segments into pair Segments results in
+ * a larger number of pair Segments, if any fragmentation (no perfect match of
+ * clusters) within the contingency table has occurred (multiple cells on one
+ * row or column). Thus for ever object Segment exists a corresponding pair
+ * Segment. Additionally pair Segments represent pairs that are only in one
+ * Clustering which occurs for each split of a clusterings cluster by another
+ * clustering. Here, these pair Segments are referenced as fragmented Segments.
+ * Within the visualization they describe (at least two) pair Segments that have
+ * a corresponding object Segment.
+ * </p>
+ * 
+ * <p>
+ * Reference:<br />
+ * Evaluation of Clusterings – Metrics and Visual Support<br />
+ * Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur
+ * Zimek<br />
+ * In: Proc. 28th International Conference on Data Engineering (ICDE) 2012
+ * </p>
+ * 
+ * @author Sascha Goldhofer
+ * @author Erich Schubert
+ */
+@Reference(title = "Evaluation of Clusterings – Metrics and Visual Support", authors = "Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", booktitle = "Proc. 28th International Conference on Data Engineering (ICDE) 2012", url = "http://elki.dbs.ifi.lmu.de/wiki/PairSegments")
+public class Segments extends BasicResult implements Iterable<Segment> {
+  /**
+   * Class logger
+   */
+  private static final Logging logger = Logging.getLogger(Segments.class);
+
+  /**
+   * Clusterings
+   */
+  private List<Clustering<?>> clusterings;
+
+  /**
+   * Clusters
+   */
+  private List<List<? extends Cluster<?>>> clusters;
+
+  /**
+   * Number of clusterings in comparison
+   */
+  private int clusteringsCount;
+
+  /**
+   * Number of Clusters for each clustering
+   */
+  private int[] numclusters;
+
+  /**
+   * Total number of objects
+   */
+  private int totalObjects;
+
+  /**
+   * Pairs actually present in the data set
+   */
+  private int actualPairs;
+
+  /**
+   * The actual segments
+   */
+  private TreeMap<Segment, Segment> segments;
+
+  /**
+   * Initialize segments. Add DB objects via addObject method.
+   * 
+   * @param clusterings List of clusterings in comparison
+   * @param baseResult used to retrieve db objects
+   */
+  public Segments(List<Clustering<?>> clusterings, HierarchicalResult baseResult) {
+    super("cluster pair segments", "pair-segments");
+    this.clusterings = clusterings;
+    this.clusteringsCount = clusterings.size();
+    segments = new TreeMap<Segment, Segment>(); // TODO: replace with array list
+
+    numclusters = new int[clusteringsCount];
+    clusters = new ArrayList<List<? extends Cluster<?>>>(clusteringsCount);
+
+    // save count of clusters
+    int clusteringIndex = 0;
+    for(Clustering<?> clr : clusterings) {
+      List<? extends Cluster<?>> curClusters = clr.getAllClusters();
+      clusters.add(curClusters);
+      numclusters[clusteringIndex] = curClusters.size();
+
+      clusteringIndex++;
+    }
+
+    recursivelyFill(clusters);
+    for(Segment seg : segments.keySet()) {
+      actualPairs += seg.getPairCount();
+    }
+  }
+
+  private void recursivelyFill(List<List<? extends Cluster<?>>> cs) {
+    final int numclusterings = cs.size();
+    Iterator<? extends Cluster<?>> iter = cs.get(0).iterator();
+    int[] path = new int[numclusterings];
+    for(int cnum = 0; iter.hasNext(); cnum++) {
+      Cluster<?> clust = iter.next();
+      path[0] = cnum;
+      if(numclusterings > 1) {
+        SetDBIDs idset = DBIDUtil.ensureSet(clust.getIDs());
+        recursivelyFill(cs, 1, idset, idset, path, true);
+      }
+      else {
+        // Add to results.
+        makeOrUpdateSegment(path, clust.getIDs(), clust.size() * (clust.size() - 1));
+      }
+
+      totalObjects += clust.size();
+    }
+  }
+
+  private void recursivelyFill(List<List<? extends Cluster<?>>> cs, int depth, SetDBIDs first, SetDBIDs second, int[] path, boolean objectsegment) {
+    final int numclusterings = cs.size();
+    Iterator<? extends Cluster<?>> iter = cs.get(depth).iterator();
+    for(int cnum = 0; iter.hasNext(); cnum++) {
+      Cluster<?> clust = iter.next();
+      // Compute intersections with new cluster.
+      // nfp := intersection( first, cluster )
+      // Adding asymmetric differences to nd1, nd2.
+      // nse := intersection( second, cluster )
+      HashSetModifiableDBIDs nfirstp = DBIDUtil.newHashSet(first.size());
+      HashSetModifiableDBIDs ndelta1 = DBIDUtil.newHashSet(first);
+      HashSetModifiableDBIDs ndelta2 = DBIDUtil.newHashSet();
+      HashSetModifiableDBIDs nsecond = DBIDUtil.newHashSet(second.size());
+      for(DBID id : clust.getIDs()) {
+        if(ndelta1.remove(id)) {
+          nfirstp.add(id);
+        }
+        else {
+          ndelta2.add(id);
+        }
+        if(second.contains(id)) {
+          nsecond.add(id);
+        }
+      }
+      if(nsecond.size() <= 0) {
+        continue; // disjoint
+      }
+      if(nfirstp.size() > 0) {
+        path[depth] = cnum;
+        if(depth < numclusterings - 1) {
+          recursivelyFill(cs, depth + 1, nfirstp, nsecond, path, objectsegment);
+        }
+        else {
+          // Add to results.
+          // In fact, nfirstp should equal nsecond here
+          int selfpairs = DBIDUtil.intersection(nfirstp, nsecond).size();
+          if(objectsegment) {
+            makeOrUpdateSegment(path, nfirstp, (nfirstp.size() * nsecond.size()) - selfpairs);
+          }
+          else {
+            makeOrUpdateSegment(path, null, (nfirstp.size() * nsecond.size()) - selfpairs);
+          }
+        }
+      }
+      // Elements that were in first, but in not in the cluster
+      if(ndelta1.size() > 0) {
+        path[depth] = Segment.UNCLUSTERED;
+        if(depth < numclusterings - 1) {
+          recursivelyFill(cs, depth + 1, ndelta1, nsecond, path, false);
+        }
+        else {
+          // Add to results.
+          int selfpairs = DBIDUtil.intersection(ndelta1, nsecond).size();
+          makeOrUpdateSegment(path, null, (ndelta1.size() * nsecond.size()) - selfpairs);
+        }
+      }
+      // FIXME: this part doesn't work right yet for over 2 clusterings!
+      // It used to work in revision 9236, eventually go back to this code!
+      if(ndelta2.size() > 0 && objectsegment) {
+        int[] npath = new int[path.length];
+        Arrays.fill(npath, Segment.UNCLUSTERED);
+        npath[depth] = cnum;
+        if(depth < numclusterings - 1) {
+          recursivelyFill(cs, depth + 1, ndelta2, nsecond, npath, false);
+        }
+        else {
+          // Add to results.
+          int selfpairs = DBIDUtil.intersection(ndelta2, nsecond).size();
+          makeOrUpdateSegment(npath, null, (ndelta2.size() * nsecond.size()) - selfpairs);
+        }
+      }
+    }
+  }
+
+  private void makeOrUpdateSegment(int[] path, DBIDs ids, int pairsize) {
+    Segment seg = segments.get(new Segment(path));
+    if(seg == null) {
+      seg = new Segment(path.clone());
+      segments.put(seg, seg);
+    }
+    if(ids != null) {
+      if(seg.getDBIDs() != null) {
+        logger.warning("Expected segment to not have IDs.");
+      }
+      seg.objIDs = ids;
+    }
+    seg.pairsize += pairsize;
+  }
+
+  /**
+   * Get the description of the nth clustering.
+   * 
+   * @param clusteringID Clustering number
+   * @return long name of clustering
+   */
+  public String getClusteringDescription(int clusteringID) {
+    return clusterings.get(clusteringID).getLongName();
+  }
+
+  /**
+   * Return to a given segment with unpaired objects, the corresponding segments
+   * that result in an unpaired segment. So, one cluster of a clustering is
+   * split by another clustering in multiple segments, resulting in a segment
+   * with unpaired objects, describing the missing pairs between the split
+   * cluster / between the segments.
+   * 
+   * Basically we compare only two clusterings at once. If those clusterings do
+   * not have the whole cluster in common, we have at least three segments (two
+   * cluster), one of them containing the unpaired segment. A segmentID 3-0,
+   * describes a cluster 3 in clustering 1 (index 0) and all clusters 3-x in
+   * clustering 2. So we search for all segments 3-x (0 being a wildcard).
+   * 
+   * @param unpairedSegment
+   * @return Segments describing the set of objects that result in an unpaired
+   *         segment
+   */
+  public List<Segment> getPairedSegments(Segment unpairedSegment) {
+    ArrayList<Segment> pairedSegments = new ArrayList<Segment>();
+    // search the segments. Index at "unpairedClustering" being the wildcard.
+    segments: for(Segment segment : this) {
+      // if mismatch except at unpaired Clustering index => exclude.
+      for(int i = 0; i < clusteringsCount; i++) {
+        if(unpairedSegment.get(i) != Segment.UNCLUSTERED) {
+          // mismatch
+          if(segment.get(i) != unpairedSegment.get(i)) {
+            continue segments;
+          }
+        }
+        // do not add wildcard
+        if(segment.get(i) == Segment.UNCLUSTERED) {
+          continue segments;
+        }
+      }
+
+      // add segment to list
+      pairedSegments.add(segment);
+    }
+
+    return pairedSegments;
+  }
+
+  /**
+   * @param temp Temporary segment to be unified
+   * @return the segmentID given by its string representation
+   */
+  public Segment unifySegment(Segment temp) {
+    Segment found = segments.get(temp);
+    return (found != null) ? found : temp;
+  }
+
+  /**
+   * Get the number of segments
+   * 
+   * @return Number of segments
+   */
+  public int size() {
+    return segments.size();
+  }
+
+  /**
+   * Get total number of pairs with or without the unclustered pairs.
+   * 
+   * @param withUnclusteredPairs if false, segment with unclustered pairs is
+   *        removed
+   * @return pair count, with or without unclusted (non-existant) pairs
+   */
+  public int getPairCount(boolean withUnclusteredPairs) {
+    if(withUnclusteredPairs) {
+      return (totalObjects * (totalObjects - 1)); // / 2;
+    }
+    else {
+      return actualPairs;
+    }
+  }
+
+  /**
+   * Get the number of clusterings
+   * 
+   * @return number of clusterings compared
+   */
+  public int getClusterings() {
+    return clusteringsCount;
+  }
+
+  /**
+   * Return the sum of all clusters
+   * 
+   * @return sum of all cluster counts
+   */
+  public int getTotalClusterCount() {
+    int clusterCount = 0;
+
+    for(int i = 0; i < numclusters.length; i++) {
+      clusterCount += numclusters[i];
+    }
+
+    return clusterCount;
+  }
+
+  /**
+   * Returns the highest number of Clusters in the clusterings
+   * 
+   * @return highest cluster count
+   */
+  public int getHighestClusterCount() {
+    int maxClusters = 0;
+
+    for(int i = 0; i < numclusters.length; i++) {
+      maxClusters = Math.max(maxClusters, numclusters[i]);
+    }
+    return maxClusters;
+  }
+
+  @Override
+  public Iterator<Segment> iterator() {
+    return segments.keySet().iterator();
+  }
+}
+\ No newline at end of file