diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java | 156 |
1 files changed, 79 insertions, 77 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index 01a693e4..37b3eb57 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -43,13 +43,13 @@ import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -57,7 +57,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.IntervalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -96,11 +96,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications") @Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.") @Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314") -public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(CLIQUE.class); + private static final Logging LOG = Logging.getLogger(CLIQUE.class); /** * Parameter to specify the number of intervals (units) in each dimension, @@ -109,7 +109,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -clique.xsi} * </p> */ - public static final OptionID XSI_ID = OptionID.getOrCreateOptionID("clique.xsi", "The number of intervals (units) in each dimension."); + public static final OptionID XSI_ID = new OptionID("clique.xsi", "The number of intervals (units) in each dimension."); /** * Parameter to specify the density threshold for the selectivity of a unit, @@ -119,7 +119,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -clique.tau} * </p> */ - public static final OptionID TAU_ID = OptionID.getOrCreateOptionID("clique.tau", "The density threshold for the selectivity of a unit, where the selectivity is" + "the fraction of total feature vectors contained in this unit."); + public static final OptionID TAU_ID = new OptionID("clique.tau", "The density threshold for the selectivity of a unit, where the selectivity is" + "the fraction of total feature vectors contained in this unit."); /** * Flag to indicate that only subspaces with large coverage (i.e. the fraction @@ -129,7 +129,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -clique.prune} * </p> */ - public static final OptionID PRUNE_ID = OptionID.getOrCreateOptionID("clique.prune", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned."); + public static final OptionID PRUNE_ID = new OptionID("clique.prune", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned."); /** * Holds the value of {@link #XSI_ID}. @@ -169,53 +169,53 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus public Clustering<SubspaceModel<V>> run(Relation<V> relation) { // 1. Identification of subspaces that contain clusters // TODO: use step logging. - if(logger.isVerbose()) { - logger.verbose("*** 1. Identification of subspaces that contain clusters ***"); + if(LOG.isVerbose()) { + LOG.verbose("*** 1. Identification of subspaces that contain clusters ***"); } SortedMap<Integer, List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new TreeMap<Integer, List<CLIQUESubspace<V>>>(); List<CLIQUESubspace<V>> denseSubspaces = findOneDimensionalDenseSubspaces(relation); - dimensionToDenseSubspaces.put(0, denseSubspaces); - if(logger.isVerbose()) { - logger.verbose(" 1-dimensional dense subspaces: " + denseSubspaces.size()); + dimensionToDenseSubspaces.put(Integer.valueOf(0), denseSubspaces); + if(LOG.isVerbose()) { + LOG.verbose(" 1-dimensional dense subspaces: " + denseSubspaces.size()); } - if(logger.isDebugging()) { + if(LOG.isDebugging()) { for(CLIQUESubspace<V> s : denseSubspaces) { - logger.debug(s.toString(" ")); + LOG.debug(s.toString(" ")); } } - int dimensionality = DatabaseUtil.dimensionality(relation); + int dimensionality = RelationUtil.dimensionality(relation); for(int k = 2; k <= dimensionality && !denseSubspaces.isEmpty(); k++) { denseSubspaces = findDenseSubspaces(relation, denseSubspaces); - dimensionToDenseSubspaces.put(k - 1, denseSubspaces); - if(logger.isVerbose()) { - logger.verbose(" " + k + "-dimensional dense subspaces: " + denseSubspaces.size()); + dimensionToDenseSubspaces.put(Integer.valueOf(k - 1), denseSubspaces); + if(LOG.isVerbose()) { + LOG.verbose(" " + k + "-dimensional dense subspaces: " + denseSubspaces.size()); } - if(logger.isDebugging()) { + if(LOG.isDebugging()) { for(CLIQUESubspace<V> s : denseSubspaces) { - logger.debug(s.toString(" ")); + LOG.debug(s.toString(" ")); } } } // 2. Identification of clusters - if(logger.isVerbose()) { - logger.verbose("*** 2. Identification of clusters ***"); + if(LOG.isVerbose()) { + LOG.verbose("*** 2. Identification of clusters ***"); } // build result int numClusters = 1; Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("CLIQUE clustering", "clique-clustering"); for(Integer dim : dimensionToDenseSubspaces.keySet()) { List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim); - List<Pair<Subspace<V>, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces); + List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces); - if(logger.isVerbose()) { - logger.verbose(" " + (dim + 1) + "-dimensional clusters: " + modelsAndClusters.size()); + if(LOG.isVerbose()) { + LOG.verbose(" " + (dim + 1) + "-dimensional clusters: " + modelsAndClusters.size()); } - for(Pair<Subspace<V>, ModifiableDBIDs> modelAndCluster : modelsAndClusters) { + for(Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) { Cluster<SubspaceModel<V>> newCluster = new Cluster<SubspaceModel<V>>(modelAndCluster.second); - newCluster.setModel(new SubspaceModel<V>(modelAndCluster.first, DatabaseUtil.centroid(relation, modelAndCluster.second))); + newCluster.setModel(new SubspaceModel<V>(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).toVector(relation))); newCluster.setName("cluster_" + numClusters++); result.addCluster(newCluster); } @@ -232,13 +232,13 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @return the clusters in the specified dense subspaces and the corresponding * cluster models */ - private List<Pair<Subspace<V>, ModifiableDBIDs>> determineClusters(List<CLIQUESubspace<V>> denseSubspaces) { - List<Pair<Subspace<V>, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace<V>, ModifiableDBIDs>>(); + private List<Pair<Subspace, ModifiableDBIDs>> determineClusters(List<CLIQUESubspace<V>> denseSubspaces) { + List<Pair<Subspace, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace, ModifiableDBIDs>>(); for(CLIQUESubspace<V> subspace : denseSubspaces) { - List<Pair<Subspace<V>, ModifiableDBIDs>> clustersInSubspace = subspace.determineClusters(); - if(logger.isDebugging()) { - logger.debugFine("Subspace " + subspace + " clusters " + clustersInSubspace.size()); + List<Pair<Subspace, ModifiableDBIDs>> clustersInSubspace = subspace.determineClusters(); + if(LOG.isDebugging()) { + LOG.debugFine("Subspace " + subspace + " clusters " + clustersInSubspace.size()); } clusters.addAll(clustersInSubspace); } @@ -289,7 +289,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @return the created one dimensional units */ private Collection<CLIQUEUnit<V>> initOneDimensionalUnits(Relation<V> database) { - int dimensionality = DatabaseUtil.dimensionality(database); + int dimensionality = RelationUtil.dimensionality(database); // initialize minima and maxima double[] minima = new double[dimensionality]; double[] maxima = new double[dimensionality]; @@ -312,12 +312,12 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus unit_lengths[d] = (maxima[d] - minima[d]) / xsi; } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); msg.append(" minima: ").append(FormatUtil.format(minima, ", ", 2)); msg.append("\n maxima: ").append(FormatUtil.format(maxima, ", ", 2)); msg.append("\n unit lengths: ").append(FormatUtil.format(unit_lengths, ", ", 2)); - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } // determine the boundaries of the units @@ -332,10 +332,10 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus } } } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); msg.append(" unit bounds ").append(FormatUtil.format(new Matrix(unit_bounds), " ")); - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } // build the 1 dimensional units @@ -346,10 +346,10 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus } } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); msg.append(" total number of 1-dim units: ").append(units.size()); - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } return units; @@ -367,12 +367,12 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus if(minima.length != featureVector.getDimensionality()) { throw new IllegalArgumentException("FeatureVectors differ in length."); } - for(int d = 1; d <= featureVector.getDimensionality(); d++) { - if((featureVector.doubleValue(d)) > maxima[d - 1]) { - maxima[d - 1] = (featureVector.doubleValue(d)); + for(int d = 0; d < featureVector.getDimensionality(); d++) { + if((featureVector.doubleValue(d)) > maxima[d]) { + maxima[d] = (featureVector.doubleValue(d)); } - if((featureVector.doubleValue(d)) < minima[d - 1]) { - minima[d - 1] = (featureVector.doubleValue(d)); + if((featureVector.doubleValue(d)) < minima[d]) { + minima[d] = (featureVector.doubleValue(d)); } } } @@ -387,38 +387,37 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus */ private List<CLIQUESubspace<V>> findOneDimensionalDenseSubspaceCandidates(Relation<V> database) { Collection<CLIQUEUnit<V>> units = initOneDimensionalUnits(database); - Collection<CLIQUEUnit<V>> denseUnits = new ArrayList<CLIQUEUnit<V>>(); - Map<Integer, CLIQUESubspace<V>> denseSubspaces = new HashMap<Integer, CLIQUESubspace<V>>(); - // identify dense units double total = database.size(); - for(DBIDIter it = database.iterDBIDs(); it.valid();) { + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { V featureVector = database.get(it); - final DBID id = it.getDBID(); - it.advance(); for(CLIQUEUnit<V> unit : units) { - unit.addFeatureVector(id, featureVector); - // unit is a dense unit - // FIXME: why it.valid()? - if(!it.valid() && unit.selectivity(total) >= tau) { - denseUnits.add(unit); - // add the dense unit to its subspace - int dim = unit.getIntervals().iterator().next().getDimension(); - CLIQUESubspace<V> subspace_d = denseSubspaces.get(dim); - if(subspace_d == null) { - subspace_d = new CLIQUESubspace<V>(dim); - denseSubspaces.put(dim, subspace_d); - } - subspace_d.addDenseUnit(unit); + unit.addFeatureVector(it, featureVector); + } + } + + Collection<CLIQUEUnit<V>> denseUnits = new ArrayList<CLIQUEUnit<V>>(); + Map<Integer, CLIQUESubspace<V>> denseSubspaces = new HashMap<Integer, CLIQUESubspace<V>>(); + for(CLIQUEUnit<V> unit : units) { + // unit is a dense unit + if(unit.selectivity(total) >= tau) { + denseUnits.add(unit); + // add the dense unit to its subspace + int dim = unit.getIntervals().iterator().next().getDimension(); + CLIQUESubspace<V> subspace_d = denseSubspaces.get(Integer.valueOf(dim)); + if(subspace_d == null) { + subspace_d = new CLIQUESubspace<V>(dim); + denseSubspaces.put(Integer.valueOf(dim), subspace_d); } + subspace_d.addDenseUnit(unit); } } - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); msg.append(" number of 1-dim dense units: ").append(denseUnits.size()); msg.append("\n number of 1-dim dense subspace candidates: ").append(denseSubspaces.size()); - logger.debugFine(msg.toString()); + LOG.debugFine(msg.toString()); } List<CLIQUESubspace<V>> subspaceCandidates = new ArrayList<CLIQUESubspace<V>>(denseSubspaces.values()); @@ -574,7 +573,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -584,7 +583,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { protected int xsi; protected double tau; @@ -594,19 +593,22 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter xsiP = new IntParameter(XSI_ID, new GreaterConstraint(0)); + IntParameter xsiP = new IntParameter(XSI_ID); + xsiP.addConstraint(new GreaterConstraint(0)); if(config.grab(xsiP)) { - xsi = xsiP.getValue(); + xsi = xsiP.intValue(); } - DoubleParameter tauP = new DoubleParameter(TAU_ID, new IntervalConstraint(0, IntervalConstraint.IntervalBoundary.OPEN, 1, IntervalConstraint.IntervalBoundary.OPEN)); + DoubleParameter tauP = new DoubleParameter(TAU_ID); + tauP.addConstraint(new GreaterConstraint(0)); + tauP.addConstraint(new LessConstraint(1)); if(config.grab(tauP)) { - tau = tauP.getValue(); + tau = tauP.doubleValue(); } Flag pruneF = new Flag(PRUNE_ID); if(config.grab(pruneF)) { - prune = pruneF.getValue(); + prune = pruneF.isTrue(); } } |