4 files changed, 583 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java
new file mode 100644
index 00000000..2471b135
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/ClusterPairSegmentAnalysis.java
@@ -0,0 +1,47 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.result.Result;
+import de.lmu.ifi.dbs.elki.result.ResultUtil;
+
+/**
+ * Evaluate clustering results by building segments for their pairs: shared
+ * pairs and differences.
+ * 
+ * @author Sascha Goldhofer
+ * @author Erich Schubert
+ * 
+ * @apiviz.uses Clustering
+ * @apiviz.uses Segments
+ */
+public class ClusterPairSegmentAnalysis implements Evaluator {
+  /**
+   * Constructor.
+   */
+  public ClusterPairSegmentAnalysis() {
+    super();
+  }
+
+  /**
+   * Perform clusterings evaluation
+   */
+  @Override
+  public void processNewResult(HierarchicalResult baseResult, Result result) {
+    // Get all new clusterings
+    // TODO: handle clusterings added later, too. Can we update the result?
+    
+    List<Clustering<?>> clusterings = ResultUtil.getClusteringResults(result);
+    // Abort if not enough clusterings to compare
+    if(clusterings.size() < 2) {
+      return;
+    }
+
+    // create segments
+    Segments segments = new Segments(clusterings, baseResult);
+    baseResult.getHierarchy().add(result, segments);
+  }
+}
+\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java
new file mode 100644
index 00000000..8ce8fd4a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segment.java
@@ -0,0 +1,149 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+
+/**
+ * A segment represents a set of pairs that share the same clustering
+ * properties.
+ * 
+ * As such, for each ring (= clustering), a cluster number (or the constant
+ * {@link #UNCLUSTERED}) is stored.
+ */
+public class Segment implements Comparable<Segment> {
+  /**
+   * Object is not clustered
+   */
+  public static final int UNCLUSTERED = -1;
+
+  /**
+   * IDs in segment, for object segments.
+   */
+  protected DBIDs objIDs = null;
+
+  /**
+   * Size of cluster, in pairs.
+   */
+  protected long pairsize = 0;
+
+  /**
+   * The cluster numbers in each ring
+   */
+  protected int[] clusterIds;
+
+  public Segment(int clusterings) {
+    clusterIds = new int[clusterings];
+  }
+
+  public long getPairCount() {
+    return pairsize;
+  }
+
+  /**
+   * Constructor.
+   * 
+   * @param clone Clone of cluster ids
+   */
+  public Segment(int[] clone) {
+    clusterIds = clone;
+  }
+
+  /**
+   * Get cluster number for index idx.
+   * 
+   * @param idx Index
+   * @return Cluster number
+   */
+  public int get(int idx) {
+    return clusterIds[idx];
+  }
+
+  /**
+   * Checks if the segment has a cluster with unpaired objects. Unpaired
+   * clusters are represented by "0" (0 = all).
+   * 
+   * @return true when unclustered in at least one dimension.
+   */
+  public boolean isUnpaired() {
+    for(int id : clusterIds) {
+      if(id == UNCLUSTERED) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Check if this segment contains the pairs that are never clustered by any of
+   * the clusterings (all 0).
+   * 
+   * @return true when unclustered everywhere
+   */
+  public boolean isNone() {
+    for(int id : clusterIds) {
+      if(id != UNCLUSTERED) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns the index of the first clustering having an unpaired cluster, or -1
+   * no unpaired cluster exists.
+   * 
+   * @return clustering id or -1
+   */
+  public int getUnpairedClusteringIndex() {
+    for(int index = 0; index < clusterIds.length; index++) {
+      if(clusterIds[index] == UNCLUSTERED) {
+        return index;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Get the DBIDs of objects contained in this segment.
+   * 
+   * @return the segment IDs
+   */
+  public DBIDs getDBIDs() {
+    return objIDs;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if(!(Segment.class.isInstance(obj))) {
+      return false;
+    }
+    Segment other = (Segment) obj;
+    return Arrays.equals(clusterIds, other.clusterIds);
+  }
+
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(clusterIds);
+  }
+
+  @Override
+  public int compareTo(Segment sid) {
+    for(int i = 0; i < clusterIds.length; i++) {
+      final int a = this.clusterIds[i];
+      final int b = sid.clusterIds[i];
+      if(a != b) {
+        if(a * b > 0) {
+          // Regular comparison
+          return (a < b) ? -1 : +1;
+          // return (a < b) ? +1 : -1;
+        }
+        else {
+          // Inverse, to sort negative last
+          return (a < b) ? +1 : -1;
+        }
+      }
+    }
+    return 0;
+  }
+}
+\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
new file mode 100644
index 00000000..97c79dfb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java
@@ -0,0 +1,361 @@
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.TreeMap;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.BasicResult;
+import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+
+/**
+ * Creates segments of two or more clusterings.
+ * 
+ * <p>
+ * Segments are the equally paired database objects of all given (2+)
+ * clusterings. Given a contingency table, an object Segment represents the
+ * table's cells where an intersection of classes and labels are given. Pair
+ * Segments are created by converting an object Segment into its pair
+ * representation. Converting all object Segments into pair Segments results in
+ * a larger number of pair Segments, if any fragmentation (no perfect match of
+ * clusters) within the contingency table has occurred (multiple cells on one
+ * row or column). Thus for ever object Segment exists a corresponding pair
+ * Segment. Additionally pair Segments represent pairs that are only in one
+ * Clustering which occurs for each split of a clusterings cluster by another
+ * clustering. Here, these pair Segments are referenced as fragmented Segments.
+ * Within the visualization they describe (at least two) pair Segments that have
+ * a corresponding object Segment.
+ * </p>
+ * 
+ * <p>
+ * Reference:<br />
+ * Evaluation of Clusterings – Metrics and Visual Support<br />
+ * Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur
+ * Zimek<br />
+ * In: Proc. 28th International Conference on Data Engineering (ICDE) 2012
+ * </p>
+ * 
+ * @author Sascha Goldhofer
+ * @author Erich Schubert
+ */
+@Reference(title = "Evaluation of Clusterings – Metrics and Visual Support", authors = "Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", booktitle = "Proc. 28th International Conference on Data Engineering (ICDE) 2012", url = "http://elki.dbs.ifi.lmu.de/wiki/PairSegments")
+public class Segments extends BasicResult implements Iterable<Segment> {
+  /**
+   * Class logger
+   */
+  private static final Logging logger = Logging.getLogger(Segments.class);
+
+  /**
+   * Clusterings
+   */
+  private List<Clustering<?>> clusterings;
+
+  /**
+   * Clusters
+   */
+  private List<List<? extends Cluster<?>>> clusters;
+
+  /**
+   * Number of clusterings in comparison
+   */
+  private int clusteringsCount;
+
+  /**
+   * Number of Clusters for each clustering
+   */
+  private int[] numclusters;
+
+  /**
+   * Total number of objects
+   */
+  private int totalObjects;
+
+  /**
+   * Pairs actually present in the data set
+   */
+  private int actualPairs;
+
+  /**
+   * The actual segments
+   */
+  private TreeMap<Segment, Segment> segments;
+
+  /**
+   * Initialize segments. Add DB objects via addObject method.
+   * 
+   * @param clusterings List of clusterings in comparison
+   * @param baseResult used to retrieve db objects
+   */
+  public Segments(List<Clustering<?>> clusterings, HierarchicalResult baseResult) {
+    super("cluster pair segments", "pair-segments");
+    this.clusterings = clusterings;
+    this.clusteringsCount = clusterings.size();
+    segments = new TreeMap<Segment, Segment>(); // TODO: replace with array list
+
+    numclusters = new int[clusteringsCount];
+    clusters = new ArrayList<List<? extends Cluster<?>>>(clusteringsCount);
+
+    // save count of clusters
+    int clusteringIndex = 0;
+    for(Clustering<?> clr : clusterings) {
+      List<? extends Cluster<?>> curClusters = clr.getAllClusters();
+      clusters.add(curClusters);
+      numclusters[clusteringIndex] = curClusters.size();
+
+      clusteringIndex++;
+    }
+
+    recursivelyFill(clusters);
+    for(Segment seg : segments.keySet()) {
+      actualPairs += seg.getPairCount();
+    }
+  }
+
+  private void recursivelyFill(List<List<? extends Cluster<?>>> cs) {
+    final int numclusterings = cs.size();
+    Iterator<? extends Cluster<?>> iter = cs.get(0).iterator();
+    int[] path = new int[numclusterings];
+    for(int cnum = 0; iter.hasNext(); cnum++) {
+      Cluster<?> clust = iter.next();
+      path[0] = cnum;
+      if(numclusterings > 1) {
+        SetDBIDs idset = DBIDUtil.ensureSet(clust.getIDs());
+        recursivelyFill(cs, 1, idset, idset, path, true);
+      }
+      else {
+        // Add to results.
+        makeOrUpdateSegment(path, clust.getIDs(), clust.size() * (clust.size() - 1));
+      }
+
+      totalObjects += clust.size();
+    }
+  }
+
+  private void recursivelyFill(List<List<? extends Cluster<?>>> cs, int depth, SetDBIDs first, SetDBIDs second, int[] path, boolean objectsegment) {
+    final int numclusterings = cs.size();
+    Iterator<? extends Cluster<?>> iter = cs.get(depth).iterator();
+    for(int cnum = 0; iter.hasNext(); cnum++) {
+      Cluster<?> clust = iter.next();
+      // Compute intersections with new cluster.
+      // nfp := intersection( first, cluster )
+      // Adding asymmetric differences to nd1, nd2.
+      // nse := intersection( second, cluster )
+      HashSetModifiableDBIDs nfirstp = DBIDUtil.newHashSet(first.size());
+      HashSetModifiableDBIDs ndelta1 = DBIDUtil.newHashSet(first);
+      HashSetModifiableDBIDs ndelta2 = DBIDUtil.newHashSet();
+      HashSetModifiableDBIDs nsecond = DBIDUtil.newHashSet(second.size());
+      for(DBID id : clust.getIDs()) {
+        if(ndelta1.remove(id)) {
+          nfirstp.add(id);
+        }
+        else {
+          ndelta2.add(id);
+        }
+        if(second.contains(id)) {
+          nsecond.add(id);
+        }
+      }
+      if(nsecond.size() <= 0) {
+        continue; // disjoint
+      }
+      if(nfirstp.size() > 0) {
+        path[depth] = cnum;
+        if(depth < numclusterings - 1) {
+          recursivelyFill(cs, depth + 1, nfirstp, nsecond, path, objectsegment);
+        }
+        else {
+          // Add to results.
+          // In fact, nfirstp should equal nsecond here
+          int selfpairs = DBIDUtil.intersection(nfirstp, nsecond).size();
+          if(objectsegment) {
+            makeOrUpdateSegment(path, nfirstp, (nfirstp.size() * nsecond.size()) - selfpairs);
+          }
+          else {
+            makeOrUpdateSegment(path, null, (nfirstp.size() * nsecond.size()) - selfpairs);
+          }
+        }
+      }
+      // Elements that were in first, but in not in the cluster
+      if(ndelta1.size() > 0) {
+        path[depth] = Segment.UNCLUSTERED;
+        if(depth < numclusterings - 1) {
+          recursivelyFill(cs, depth + 1, ndelta1, nsecond, path, false);
+        }
+        else {
+          // Add to results.
+          int selfpairs = DBIDUtil.intersection(ndelta1, nsecond).size();
+          makeOrUpdateSegment(path, null, (ndelta1.size() * nsecond.size()) - selfpairs);
+        }
+      }
+      // FIXME: this part doesn't work right yet for over 2 clusterings!
+      // It used to work in revision 9236, eventually go back to this code!
+      if(ndelta2.size() > 0 && objectsegment) {
+        int[] npath = new int[path.length];
+        Arrays.fill(npath, Segment.UNCLUSTERED);
+        npath[depth] = cnum;
+        if(depth < numclusterings - 1) {
+          recursivelyFill(cs, depth + 1, ndelta2, nsecond, npath, false);
+        }
+        else {
+          // Add to results.
+          int selfpairs = DBIDUtil.intersection(ndelta2, nsecond).size();
+          makeOrUpdateSegment(npath, null, (ndelta2.size() * nsecond.size()) - selfpairs);
+        }
+      }
+    }
+  }
+
+  private void makeOrUpdateSegment(int[] path, DBIDs ids, int pairsize) {
+    Segment seg = segments.get(new Segment(path));
+    if(seg == null) {
+      seg = new Segment(path.clone());
+      segments.put(seg, seg);
+    }
+    if(ids != null) {
+      if(seg.getDBIDs() != null) {
+        logger.warning("Expected segment to not have IDs.");
+      }
+      seg.objIDs = ids;
+    }
+    seg.pairsize += pairsize;
+  }
+
+  /**
+   * Get the description of the nth clustering.
+   * 
+   * @param clusteringID Clustering number
+   * @return long name of clustering
+   */
+  public String getClusteringDescription(int clusteringID) {
+    return clusterings.get(clusteringID).getLongName();
+  }
+
+  /**
+   * Return to a given segment with unpaired objects, the corresponding segments
+   * that result in an unpaired segment. So, one cluster of a clustering is
+   * split by another clustering in multiple segments, resulting in a segment
+   * with unpaired objects, describing the missing pairs between the split
+   * cluster / between the segments.
+   * 
+   * Basically we compare only two clusterings at once. If those clusterings do
+   * not have the whole cluster in common, we have at least three segments (two
+   * cluster), one of them containing the unpaired segment. A segmentID 3-0,
+   * describes a cluster 3 in clustering 1 (index 0) and all clusters 3-x in
+   * clustering 2. So we search for all segments 3-x (0 being a wildcard).
+   * 
+   * @param unpairedSegment
+   * @return Segments describing the set of objects that result in an unpaired
+   *         segment
+   */
+  public List<Segment> getPairedSegments(Segment unpairedSegment) {
+    ArrayList<Segment> pairedSegments = new ArrayList<Segment>();
+    // search the segments. Index at "unpairedClustering" being the wildcard.
+    segments: for(Segment segment : this) {
+      // if mismatch except at unpaired Clustering index => exclude.
+      for(int i = 0; i < clusteringsCount; i++) {
+        if(unpairedSegment.get(i) != Segment.UNCLUSTERED) {
+          // mismatch
+          if(segment.get(i) != unpairedSegment.get(i)) {
+            continue segments;
+          }
+        }
+        // do not add wildcard
+        if(segment.get(i) == Segment.UNCLUSTERED) {
+          continue segments;
+        }
+      }
+
+      // add segment to list
+      pairedSegments.add(segment);
+    }
+
+    return pairedSegments;
+  }
+
+  /**
+   * @param temp Temporary segment to be unified
+   * @return the segmentID given by its string representation
+   */
+  public Segment unifySegment(Segment temp) {
+    Segment found = segments.get(temp);
+    return (found != null) ? found : temp;
+  }
+
+  /**
+   * Get the number of segments
+   * 
+   * @return Number of segments
+   */
+  public int size() {
+    return segments.size();
+  }
+
+  /**
+   * Get total number of pairs with or without the unclustered pairs.
+   * 
+   * @param withUnclusteredPairs if false, segment with unclustered pairs is
+   *        removed
+   * @return pair count, with or without unclusted (non-existant) pairs
+   */
+  public int getPairCount(boolean withUnclusteredPairs) {
+    if(withUnclusteredPairs) {
+      return (totalObjects * (totalObjects - 1)); // / 2;
+    }
+    else {
+      return actualPairs;
+    }
+  }
+
+  /**
+   * Get the number of clusterings
+   * 
+   * @return number of clusterings compared
+   */
+  public int getClusterings() {
+    return clusteringsCount;
+  }
+
+  /**
+   * Return the sum of all clusters
+   * 
+   * @return sum of all cluster counts
+   */
+  public int getTotalClusterCount() {
+    int clusterCount = 0;
+
+    for(int i = 0; i < numclusters.length; i++) {
+      clusterCount += numclusters[i];
+    }
+
+    return clusterCount;
+  }
+
+  /**
+   * Returns the highest number of Clusters in the clusterings
+   * 
+   * @return highest cluster count
+   */
+  public int getHighestClusterCount() {
+    int maxClusters = 0;
+
+    for(int i = 0; i < numclusters.length; i++) {
+      maxClusters = Math.max(maxClusters, numclusters[i]);
+    }
+    return maxClusters;
+  }
+
+  @Override
+  public Iterator<Segment> iterator() {
+    return segments.keySet().iterator();
+  }
+}
+\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java
new file mode 100644
index 00000000..27176ff6
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>Pair-segment analysis of multiple clusterings.</p>
+ */
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments;
+\ No newline at end of file