diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java new file mode 100644 index 00000000..97c79dfb --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/evaluation/clustering/pairsegments/Segments.java @@ -0,0 +1,361 @@ +package de.lmu.ifi.dbs.elki.evaluation.clustering.pairsegments; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.TreeMap; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.result.BasicResult; +import de.lmu.ifi.dbs.elki.result.HierarchicalResult; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + +/** + * Creates segments of two or more clusterings. + * + * <p> + * Segments are the equally paired database objects of all given (2+) + * clusterings. Given a contingency table, an object Segment represents the + * table's cells where an intersection of classes and labels are given. Pair + * Segments are created by converting an object Segment into its pair + * representation. Converting all object Segments into pair Segments results in + * a larger number of pair Segments, if any fragmentation (no perfect match of + * clusters) within the contingency table has occurred (multiple cells on one + * row or column). Thus for ever object Segment exists a corresponding pair + * Segment. Additionally pair Segments represent pairs that are only in one + * Clustering which occurs for each split of a clusterings cluster by another + * clustering. Here, these pair Segments are referenced as fragmented Segments. + * Within the visualization they describe (at least two) pair Segments that have + * a corresponding object Segment. + * </p> + * + * <p> + * Reference:<br /> + * Evaluation of Clusterings – Metrics and Visual Support<br /> + * Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur + * Zimek<br /> + * In: Proc. 28th International Conference on Data Engineering (ICDE) 2012 + * </p> + * + * @author Sascha Goldhofer + * @author Erich Schubert + */ +@Reference(title = "Evaluation of Clusterings – Metrics and Visual Support", authors = "Elke Achtert, Sascha Goldhofer, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", booktitle = "Proc. 28th International Conference on Data Engineering (ICDE) 2012", url = "http://elki.dbs.ifi.lmu.de/wiki/PairSegments") +public class Segments extends BasicResult implements Iterable<Segment> { + /** + * Class logger + */ + private static final Logging logger = Logging.getLogger(Segments.class); + + /** + * Clusterings + */ + private List<Clustering<?>> clusterings; + + /** + * Clusters + */ + private List<List<? extends Cluster<?>>> clusters; + + /** + * Number of clusterings in comparison + */ + private int clusteringsCount; + + /** + * Number of Clusters for each clustering + */ + private int[] numclusters; + + /** + * Total number of objects + */ + private int totalObjects; + + /** + * Pairs actually present in the data set + */ + private int actualPairs; + + /** + * The actual segments + */ + private TreeMap<Segment, Segment> segments; + + /** + * Initialize segments. Add DB objects via addObject method. + * + * @param clusterings List of clusterings in comparison + * @param baseResult used to retrieve db objects + */ + public Segments(List<Clustering<?>> clusterings, HierarchicalResult baseResult) { + super("cluster pair segments", "pair-segments"); + this.clusterings = clusterings; + this.clusteringsCount = clusterings.size(); + segments = new TreeMap<Segment, Segment>(); // TODO: replace with array list + + numclusters = new int[clusteringsCount]; + clusters = new ArrayList<List<? extends Cluster<?>>>(clusteringsCount); + + // save count of clusters + int clusteringIndex = 0; + for(Clustering<?> clr : clusterings) { + List<? extends Cluster<?>> curClusters = clr.getAllClusters(); + clusters.add(curClusters); + numclusters[clusteringIndex] = curClusters.size(); + + clusteringIndex++; + } + + recursivelyFill(clusters); + for(Segment seg : segments.keySet()) { + actualPairs += seg.getPairCount(); + } + } + + private void recursivelyFill(List<List<? extends Cluster<?>>> cs) { + final int numclusterings = cs.size(); + Iterator<? extends Cluster<?>> iter = cs.get(0).iterator(); + int[] path = new int[numclusterings]; + for(int cnum = 0; iter.hasNext(); cnum++) { + Cluster<?> clust = iter.next(); + path[0] = cnum; + if(numclusterings > 1) { + SetDBIDs idset = DBIDUtil.ensureSet(clust.getIDs()); + recursivelyFill(cs, 1, idset, idset, path, true); + } + else { + // Add to results. + makeOrUpdateSegment(path, clust.getIDs(), clust.size() * (clust.size() - 1)); + } + + totalObjects += clust.size(); + } + } + + private void recursivelyFill(List<List<? extends Cluster<?>>> cs, int depth, SetDBIDs first, SetDBIDs second, int[] path, boolean objectsegment) { + final int numclusterings = cs.size(); + Iterator<? extends Cluster<?>> iter = cs.get(depth).iterator(); + for(int cnum = 0; iter.hasNext(); cnum++) { + Cluster<?> clust = iter.next(); + // Compute intersections with new cluster. + // nfp := intersection( first, cluster ) + // Adding asymmetric differences to nd1, nd2. + // nse := intersection( second, cluster ) + HashSetModifiableDBIDs nfirstp = DBIDUtil.newHashSet(first.size()); + HashSetModifiableDBIDs ndelta1 = DBIDUtil.newHashSet(first); + HashSetModifiableDBIDs ndelta2 = DBIDUtil.newHashSet(); + HashSetModifiableDBIDs nsecond = DBIDUtil.newHashSet(second.size()); + for(DBID id : clust.getIDs()) { + if(ndelta1.remove(id)) { + nfirstp.add(id); + } + else { + ndelta2.add(id); + } + if(second.contains(id)) { + nsecond.add(id); + } + } + if(nsecond.size() <= 0) { + continue; // disjoint + } + if(nfirstp.size() > 0) { + path[depth] = cnum; + if(depth < numclusterings - 1) { + recursivelyFill(cs, depth + 1, nfirstp, nsecond, path, objectsegment); + } + else { + // Add to results. + // In fact, nfirstp should equal nsecond here + int selfpairs = DBIDUtil.intersection(nfirstp, nsecond).size(); + if(objectsegment) { + makeOrUpdateSegment(path, nfirstp, (nfirstp.size() * nsecond.size()) - selfpairs); + } + else { + makeOrUpdateSegment(path, null, (nfirstp.size() * nsecond.size()) - selfpairs); + } + } + } + // Elements that were in first, but in not in the cluster + if(ndelta1.size() > 0) { + path[depth] = Segment.UNCLUSTERED; + if(depth < numclusterings - 1) { + recursivelyFill(cs, depth + 1, ndelta1, nsecond, path, false); + } + else { + // Add to results. + int selfpairs = DBIDUtil.intersection(ndelta1, nsecond).size(); + makeOrUpdateSegment(path, null, (ndelta1.size() * nsecond.size()) - selfpairs); + } + } + // FIXME: this part doesn't work right yet for over 2 clusterings! + // It used to work in revision 9236, eventually go back to this code! + if(ndelta2.size() > 0 && objectsegment) { + int[] npath = new int[path.length]; + Arrays.fill(npath, Segment.UNCLUSTERED); + npath[depth] = cnum; + if(depth < numclusterings - 1) { + recursivelyFill(cs, depth + 1, ndelta2, nsecond, npath, false); + } + else { + // Add to results. + int selfpairs = DBIDUtil.intersection(ndelta2, nsecond).size(); + makeOrUpdateSegment(npath, null, (ndelta2.size() * nsecond.size()) - selfpairs); + } + } + } + } + + private void makeOrUpdateSegment(int[] path, DBIDs ids, int pairsize) { + Segment seg = segments.get(new Segment(path)); + if(seg == null) { + seg = new Segment(path.clone()); + segments.put(seg, seg); + } + if(ids != null) { + if(seg.getDBIDs() != null) { + logger.warning("Expected segment to not have IDs."); + } + seg.objIDs = ids; + } + seg.pairsize += pairsize; + } + + /** + * Get the description of the nth clustering. + * + * @param clusteringID Clustering number + * @return long name of clustering + */ + public String getClusteringDescription(int clusteringID) { + return clusterings.get(clusteringID).getLongName(); + } + + /** + * Return to a given segment with unpaired objects, the corresponding segments + * that result in an unpaired segment. So, one cluster of a clustering is + * split by another clustering in multiple segments, resulting in a segment + * with unpaired objects, describing the missing pairs between the split + * cluster / between the segments. + * + * Basically we compare only two clusterings at once. If those clusterings do + * not have the whole cluster in common, we have at least three segments (two + * cluster), one of them containing the unpaired segment. A segmentID 3-0, + * describes a cluster 3 in clustering 1 (index 0) and all clusters 3-x in + * clustering 2. So we search for all segments 3-x (0 being a wildcard). + * + * @param unpairedSegment + * @return Segments describing the set of objects that result in an unpaired + * segment + */ + public List<Segment> getPairedSegments(Segment unpairedSegment) { + ArrayList<Segment> pairedSegments = new ArrayList<Segment>(); + // search the segments. Index at "unpairedClustering" being the wildcard. + segments: for(Segment segment : this) { + // if mismatch except at unpaired Clustering index => exclude. + for(int i = 0; i < clusteringsCount; i++) { + if(unpairedSegment.get(i) != Segment.UNCLUSTERED) { + // mismatch + if(segment.get(i) != unpairedSegment.get(i)) { + continue segments; + } + } + // do not add wildcard + if(segment.get(i) == Segment.UNCLUSTERED) { + continue segments; + } + } + + // add segment to list + pairedSegments.add(segment); + } + + return pairedSegments; + } + + /** + * @param temp Temporary segment to be unified + * @return the segmentID given by its string representation + */ + public Segment unifySegment(Segment temp) { + Segment found = segments.get(temp); + return (found != null) ? found : temp; + } + + /** + * Get the number of segments + * + * @return Number of segments + */ + public int size() { + return segments.size(); + } + + /** + * Get total number of pairs with or without the unclustered pairs. + * + * @param withUnclusteredPairs if false, segment with unclustered pairs is + * removed + * @return pair count, with or without unclusted (non-existant) pairs + */ + public int getPairCount(boolean withUnclusteredPairs) { + if(withUnclusteredPairs) { + return (totalObjects * (totalObjects - 1)); // / 2; + } + else { + return actualPairs; + } + } + + /** + * Get the number of clusterings + * + * @return number of clusterings compared + */ + public int getClusterings() { + return clusteringsCount; + } + + /** + * Return the sum of all clusters + * + * @return sum of all cluster counts + */ + public int getTotalClusterCount() { + int clusterCount = 0; + + for(int i = 0; i < numclusters.length; i++) { + clusterCount += numclusters[i]; + } + + return clusterCount; + } + + /** + * Returns the highest number of Clusters in the clusterings + * + * @return highest cluster count + */ + public int getHighestClusterCount() { + int maxClusters = 0; + + for(int i = 0; i < numclusters.length; i++) { + maxClusters = Math.max(maxClusters, numclusters[i]); + } + return maxClusters; + } + + @Override + public Iterator<Segment> iterator() { + return segments.keySet().iterator(); + } +}
\ No newline at end of file |