diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace')
8 files changed, 467 insertions, 455 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index 01a693e4..37b3eb57 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -43,13 +43,13 @@ import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -57,7 +57,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.IntervalConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -96,11 +96,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications") @Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.") @Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314") -public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(CLIQUE.class); + private static final Logging LOG = Logging.getLogger(CLIQUE.class); /** * Parameter to specify the number of intervals (units) in each dimension, @@ -109,7 +109,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -clique.xsi} * </p> */ - public static final OptionID XSI_ID = OptionID.getOrCreateOptionID("clique.xsi", "The number of intervals (units) in each dimension."); + public static final OptionID XSI_ID = new OptionID("clique.xsi", "The number of intervals (units) in each dimension."); /** * Parameter to specify the density threshold for the selectivity of a unit, @@ -119,7 +119,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -clique.tau} * </p> */ - public static final OptionID TAU_ID = OptionID.getOrCreateOptionID("clique.tau", "The density threshold for the selectivity of a unit, where the selectivity is" + "the fraction of total feature vectors contained in this unit."); + public static final OptionID TAU_ID = new OptionID("clique.tau", "The density threshold for the selectivity of a unit, where the selectivity is" + "the fraction of total feature vectors contained in this unit."); /** * Flag to indicate that only subspaces with large coverage (i.e. the fraction @@ -129,7 +129,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -clique.prune} * </p> */ - public static final OptionID PRUNE_ID = OptionID.getOrCreateOptionID("clique.prune", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned."); + public static final OptionID PRUNE_ID = new OptionID("clique.prune", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned."); /** * Holds the value of {@link #XSI_ID}. @@ -169,53 +169,53 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus public Clustering<SubspaceModel<V>> run(Relation<V> relation) { // 1. Identification of subspaces that contain clusters // TODO: use step logging. - if(logger.isVerbose()) { - logger.verbose("*** 1. Identification of subspaces that contain clusters ***"); + if(LOG.isVerbose()) { + LOG.verbose("*** 1. Identification of subspaces that contain clusters ***"); } SortedMap<Integer, List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new TreeMap<Integer, List<CLIQUESubspace<V>>>(); List<CLIQUESubspace<V>> denseSubspaces = findOneDimensionalDenseSubspaces(relation); - dimensionToDenseSubspaces.put(0, denseSubspaces); - if(logger.isVerbose()) { - logger.verbose(" 1-dimensional dense subspaces: " + denseSubspaces.size()); + dimensionToDenseSubspaces.put(Integer.valueOf(0), denseSubspaces); + if(LOG.isVerbose()) { + LOG.verbose(" 1-dimensional dense subspaces: " + denseSubspaces.size()); } - if(logger.isDebugging()) { + if(LOG.isDebugging()) { for(CLIQUESubspace<V> s : denseSubspaces) { - logger.debug(s.toString(" ")); + LOG.debug(s.toString(" ")); } } - int dimensionality = DatabaseUtil.dimensionality(relation); + int dimensionality = RelationUtil.dimensionality(relation); for(int k = 2; k <= dimensionality && !denseSubspaces.isEmpty(); k++) { denseSubspaces = findDenseSubspaces(relation, denseSubspaces); - dimensionToDenseSubspaces.put(k - 1, denseSubspaces); - if(logger.isVerbose()) { - logger.verbose(" " + k + "-dimensional dense subspaces: " + denseSubspaces.size()); + dimensionToDenseSubspaces.put(Integer.valueOf(k - 1), denseSubspaces); + if(LOG.isVerbose()) { + LOG.verbose(" " + k + "-dimensional dense subspaces: " + denseSubspaces.size()); } - if(logger.isDebugging()) { + if(LOG.isDebugging()) { for(CLIQUESubspace<V> s : denseSubspaces) { - logger.debug(s.toString(" ")); + LOG.debug(s.toString(" ")); } } } // 2. Identification of clusters - if(logger.isVerbose()) { - logger.verbose("*** 2. Identification of clusters ***"); + if(LOG.isVerbose()) { + LOG.verbose("*** 2. Identification of clusters ***"); } // build result int numClusters = 1; Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("CLIQUE clustering", "clique-clustering"); for(Integer dim : dimensionToDenseSubspaces.keySet()) { List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim); - List<Pair<Subspace<V>, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces); + List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces); - if(logger.isVerbose()) { - logger.verbose(" " + (dim + 1) + "-dimensional clusters: " + modelsAndClusters.size()); + if(LOG.isVerbose()) { + LOG.verbose(" " + (dim + 1) + "-dimensional clusters: " + modelsAndClusters.size()); } - for(Pair<Subspace<V>, ModifiableDBIDs> modelAndCluster : modelsAndClusters) { + for(Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) { Cluster<SubspaceModel<V>> newCluster = new Cluster<SubspaceModel<V>>(modelAndCluster.second); - newCluster.setModel(new SubspaceModel<V>(modelAndCluster.first, DatabaseUtil.centroid(relation, modelAndCluster.second))); + newCluster.setModel(new SubspaceModel<V>(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).toVector(relation))); newCluster.setName("cluster_" + numClusters++); result.addCluster(newCluster); } @@ -232,13 +232,13 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @return the clusters in the specified dense subspaces and the corresponding * cluster models */ - private List<Pair<Subspace<V>, ModifiableDBIDs>> determineClusters(List<CLIQUESubspace<V>> denseSubspaces) { - List<Pair<Subspace<V>, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace<V>, ModifiableDBIDs>>(); + private List<Pair<Subspace, ModifiableDBIDs>> determineClusters(List<CLIQUESubspace<V>> denseSubspaces) { + List<Pair<Subspace, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace, ModifiableDBIDs>>(); for(CLIQUESubspace<V> subspace : denseSubspaces) { - List<Pair<Subspace<V>, ModifiableDBIDs>> clustersInSubspace = subspace.determineClusters(); - if(logger.isDebugging()) { - logger.debugFine("Subspace " + subspace + " clusters " + clustersInSubspace.size()); + List<Pair<Subspace, ModifiableDBIDs>> clustersInSubspace = subspace.determineClusters(); + if(LOG.isDebugging()) { + LOG.debugFine("Subspace " + subspace + " clusters " + clustersInSubspace.size()); } clusters.addAll(clustersInSubspace); } @@ -289,7 +289,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @return the created one dimensional units */ private Collection<CLIQUEUnit<V>> initOneDimensionalUnits(Relation<V> database) { - int dimensionality = DatabaseUtil.dimensionality(database); + int dimensionality = RelationUtil.dimensionality(database); // initialize minima and maxima double[] minima = new double[dimensionality]; double[] maxima = new double[dimensionality]; @@ -312,12 +312,12 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus unit_lengths[d] = (maxima[d] - minima[d]) / xsi; } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); msg.append(" minima: ").append(FormatUtil.format(minima, ", ", 2)); msg.append("\n maxima: ").append(FormatUtil.format(maxima, ", ", 2)); msg.append("\n unit lengths: ").append(FormatUtil.format(unit_lengths, ", ", 2)); - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } // determine the boundaries of the units @@ -332,10 +332,10 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus } } } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); msg.append(" unit bounds ").append(FormatUtil.format(new Matrix(unit_bounds), " ")); - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } // build the 1 dimensional units @@ -346,10 +346,10 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus } } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); msg.append(" total number of 1-dim units: ").append(units.size()); - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } return units; @@ -367,12 +367,12 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus if(minima.length != featureVector.getDimensionality()) { throw new IllegalArgumentException("FeatureVectors differ in length."); } - for(int d = 1; d <= featureVector.getDimensionality(); d++) { - if((featureVector.doubleValue(d)) > maxima[d - 1]) { - maxima[d - 1] = (featureVector.doubleValue(d)); + for(int d = 0; d < featureVector.getDimensionality(); d++) { + if((featureVector.doubleValue(d)) > maxima[d]) { + maxima[d] = (featureVector.doubleValue(d)); } - if((featureVector.doubleValue(d)) < minima[d - 1]) { - minima[d - 1] = (featureVector.doubleValue(d)); + if((featureVector.doubleValue(d)) < minima[d]) { + minima[d] = (featureVector.doubleValue(d)); } } } @@ -387,38 +387,37 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus */ private List<CLIQUESubspace<V>> findOneDimensionalDenseSubspaceCandidates(Relation<V> database) { Collection<CLIQUEUnit<V>> units = initOneDimensionalUnits(database); - Collection<CLIQUEUnit<V>> denseUnits = new ArrayList<CLIQUEUnit<V>>(); - Map<Integer, CLIQUESubspace<V>> denseSubspaces = new HashMap<Integer, CLIQUESubspace<V>>(); - // identify dense units double total = database.size(); - for(DBIDIter it = database.iterDBIDs(); it.valid();) { + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { V featureVector = database.get(it); - final DBID id = it.getDBID(); - it.advance(); for(CLIQUEUnit<V> unit : units) { - unit.addFeatureVector(id, featureVector); - // unit is a dense unit - // FIXME: why it.valid()? - if(!it.valid() && unit.selectivity(total) >= tau) { - denseUnits.add(unit); - // add the dense unit to its subspace - int dim = unit.getIntervals().iterator().next().getDimension(); - CLIQUESubspace<V> subspace_d = denseSubspaces.get(dim); - if(subspace_d == null) { - subspace_d = new CLIQUESubspace<V>(dim); - denseSubspaces.put(dim, subspace_d); - } - subspace_d.addDenseUnit(unit); + unit.addFeatureVector(it, featureVector); + } + } + + Collection<CLIQUEUnit<V>> denseUnits = new ArrayList<CLIQUEUnit<V>>(); + Map<Integer, CLIQUESubspace<V>> denseSubspaces = new HashMap<Integer, CLIQUESubspace<V>>(); + for(CLIQUEUnit<V> unit : units) { + // unit is a dense unit + if(unit.selectivity(total) >= tau) { + denseUnits.add(unit); + // add the dense unit to its subspace + int dim = unit.getIntervals().iterator().next().getDimension(); + CLIQUESubspace<V> subspace_d = denseSubspaces.get(Integer.valueOf(dim)); + if(subspace_d == null) { + subspace_d = new CLIQUESubspace<V>(dim); + denseSubspaces.put(Integer.valueOf(dim), subspace_d); } + subspace_d.addDenseUnit(unit); } } - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); msg.append(" number of 1-dim dense units: ").append(denseUnits.size()); msg.append("\n number of 1-dim dense subspace candidates: ").append(denseSubspaces.size()); - logger.debugFine(msg.toString()); + LOG.debugFine(msg.toString()); } List<CLIQUESubspace<V>> subspaceCandidates = new ArrayList<CLIQUESubspace<V>>(denseSubspaces.values()); @@ -574,7 +573,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -584,7 +583,7 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { protected int xsi; protected double tau; @@ -594,19 +593,22 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter xsiP = new IntParameter(XSI_ID, new GreaterConstraint(0)); + IntParameter xsiP = new IntParameter(XSI_ID); + xsiP.addConstraint(new GreaterConstraint(0)); if(config.grab(xsiP)) { - xsi = xsiP.getValue(); + xsi = xsiP.intValue(); } - DoubleParameter tauP = new DoubleParameter(TAU_ID, new IntervalConstraint(0, IntervalConstraint.IntervalBoundary.OPEN, 1, IntervalConstraint.IntervalBoundary.OPEN)); + DoubleParameter tauP = new DoubleParameter(TAU_ID); + tauP.addConstraint(new GreaterConstraint(0)); + tauP.addConstraint(new LessConstraint(1)); if(config.grab(tauP)) { - tau = tauP.getValue(); + tau = tauP.doubleValue(); } Flag pruneF = new Flag(PRUNE_ID); if(config.grab(pruneF)) { - prune = pruneF.getValue(); + prune = pruneF.isTrue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index df3fe8b5..a3496a0e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -47,6 +47,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DiSHDistanceFunction; @@ -55,10 +56,11 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelati import de.lmu.ifi.dbs.elki.index.preprocessed.preference.DiSHPreferenceVectorIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; +import de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyReferenceLists; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -99,11 +101,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("DiSH: Detecting Subspace cluster Hierarchies") @Description("Algorithm to find hierarchical correlation clusters in subspaces.") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15") -public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(DiSH.class); + private static final Logging LOG = Logging.getLogger(DiSH.class); /** * Parameter that specifies the maximum radius of the neighborhood to be @@ -116,7 +118,7 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * Key: {@code -dish.epsilon} * </p> */ - public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("dish.epsilon", "The maximum radius of the neighborhood " + "to be considered in each dimension for determination of " + "the preference vector."); + public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", "The maximum radius of the neighborhood " + "to be considered in each dimension for determination of " + "the preference vector."); /** * Parameter that specifies the a minimum number of points as a smoothing @@ -128,7 +130,7 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * Key: {@code -dish.mu} * </p> */ - public static final OptionID MU_ID = OptionID.getOrCreateOptionID("dish.mu", "The minimum number of points as a smoothing factor to avoid the single-link-effekt."); + public static final OptionID MU_ID = new OptionID("dish.mu", "The minimum number of points as a smoothing factor to avoid the single-link-effekt."); /** * Holds the value of {@link #EPSILON_ID}. @@ -167,13 +169,13 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste */ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { // Instantiate DiSH distance (and thus run the preprocessor) - if(logger.isVerbose()) { - logger.verbose("*** Run DiSH preprocessor."); + if (LOG.isVerbose()) { + LOG.verbose("*** Run DiSH preprocessor."); } DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation); // Configure and run OPTICS. - if(logger.isVerbose()) { - logger.verbose("*** Run OPTICS algorithm."); + if (LOG.isVerbose()) { + LOG.verbose("*** Run OPTICS algorithm."); } ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters); opticsconfig.addParameter(OPTICS.DISTANCE_FUNCTION_ID, ProxyDistanceFunction.proxy(dishDistanceQuery)); @@ -183,8 +185,8 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste optics = opticsconfig.tryInstantiate(cls); ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation); - if(logger.isVerbose()) { - logger.verbose("*** Compute Clusters."); + if (LOG.isVerbose()) { + LOG.verbose("*** Compute Clusters."); } return computeClusters(relation, opticsResult, dishDistanceQuery); } @@ -197,64 +199,64 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * @param distFunc Distance function */ private Clustering<SubspaceModel<V>> computeClusters(Relation<V> database, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder, DiSHDistanceFunction.Instance<V> distFunc) { - int dimensionality = DatabaseUtil.dimensionality(database); + int dimensionality = RelationUtil.dimensionality(database); int minpts = dishDistance.getMinpts(); // extract clusters Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder); - if(logger.isVerbose()) { - StringBuffer msg = new StringBuffer("Step 1: extract clusters"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append("\n").append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); + if (LOG.isVerbose()) { + StringBuilder msg = new StringBuilder("Step 1: extract clusters"); + for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); } } - logger.verbose(msg.toString()); + LOG.verbose(msg.toString()); } // check if there are clusters < minpts checkClusters(database, distFunc, clustersMap, minpts); - if(logger.isVerbose()) { - StringBuffer msg = new StringBuffer("Step 2: check clusters"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append("\n").append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); + if (LOG.isVerbose()) { + StringBuilder msg = new StringBuilder("Step 2: check clusters"); + for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); } } - logger.verbose(msg.toString()); + LOG.verbose(msg.toString()); } // sort the clusters List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap); - if(logger.isVerbose()) { - StringBuffer msg = new StringBuffer("Step 3: sort clusters"); - for(Cluster<SubspaceModel<V>> c : clusters) { - msg.append("\n").append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size()); + if (LOG.isVerbose()) { + StringBuilder msg = new StringBuilder("Step 3: sort clusters"); + for (Cluster<SubspaceModel<V>> c : clusters) { + msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size()); } - logger.verbose(msg.toString()); + LOG.verbose(msg.toString()); } // build the hierarchy buildHierarchy(database, distFunc, clusters, dimensionality); - if(logger.isVerbose()) { - StringBuffer msg = new StringBuffer("Step 4: build hierarchy"); - for(Cluster<SubspaceModel<V>> c : clusters) { - msg.append("\n").append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); - for(Cluster<SubspaceModel<V>> cluster : c.getParents()) { + if (LOG.isVerbose()) { + StringBuilder msg = new StringBuilder("Step 4: build hierarchy"); + for (Cluster<SubspaceModel<V>> c : clusters) { + msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); + for (Cluster<SubspaceModel<V>> cluster : c.getParents()) { msg.append("\n parent ").append(cluster); } - for(Cluster<SubspaceModel<V>> cluster : c.getChildren()) { + for (Cluster<SubspaceModel<V>> cluster : c.getChildren()) { msg.append("\n child ").append(cluster); } } - logger.verbose(msg.toString()); + LOG.verbose(msg.toString()); } // build result Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("DiSH clustering", "dish-clustering"); - for(Cluster<SubspaceModel<V>> c : clusters) { - if(c.getParents() == null || c.getParents().isEmpty()) { + for (Cluster<SubspaceModel<V>> c : clusters) { + if (c.getParents() == null || c.getParents().isEmpty()) { result.addCluster(c); } } @@ -270,12 +272,12 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * @return the extracted clusters */ private Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> extractClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder) { - FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Extract Clusters", database.size(), logger) : null; + FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extract Clusters", database.size(), LOG) : null; int processed = 0; Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>>(); Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>>(); Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<DBID, Pair<BitSet, ArrayModifiableDBIDs>>(); - for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { + for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next(); entryMap.put(entry.getID(), entry); @@ -284,68 +286,68 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste // get the list of (parallel) clusters for the preference vector List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector); - if(parallelClusters == null) { + if (parallelClusters == null) { parallelClusters = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(); clustersMap.put(preferenceVector, parallelClusters); } // look for the proper cluster Pair<BitSet, ArrayModifiableDBIDs> cluster = null; - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - V c_centroid = DatabaseUtil.centroid(database, c.second, c.first); + for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database); PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector); - if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { + if (dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector()); - if(d <= 2 * epsilon) { + if (d <= 2 * epsilon) { cluster = c; break; } } } - if(cluster == null) { + if (cluster == null) { cluster = new Pair<BitSet, ArrayModifiableDBIDs>(preferenceVector, DBIDUtil.newArray()); parallelClusters.add(cluster); } cluster.second.add(entry.getID()); entryToClusterMap.put(entry.getID(), cluster); - if(progress != null) { - progress.setProcessed(++processed, logger); + if (progress != null) { + progress.setProcessed(++processed, LOG); } } - if(progress != null) { - progress.ensureCompleted(logger); + if (progress != null) { + progress.ensureCompleted(LOG); } - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer("Step 0"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append("\n").append(FormatUtil.format(DatabaseUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size()); + if (LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder("Step 0"); + for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size()); } } - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } // add the predecessor to the cluster - for(BitSet pv : clustersMap.keySet()) { + for (BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { - if(cluster.second.isEmpty()) { + for (Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { + if (cluster.second.isEmpty()) { continue; } DBID firstID = cluster.second.get(0); ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID); DBID predecessorID = entry.getPredecessorID(); - if(predecessorID == null) { + if (predecessorID == null) { continue; } ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID); // parallel cluster - if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { + if (predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { continue; } - if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) { + if (predecessor.getReachability().compareTo(entry.getReachability()) < 0) { continue; } @@ -369,22 +371,21 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * @return a sorted list of the clusters */ private List<Cluster<SubspaceModel<V>>> sortClusters(Relation<V> database, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { - final int db_dim = DatabaseUtil.dimensionality(database); + final int db_dim = RelationUtil.dimensionality(database); // int num = 1; List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<Cluster<SubspaceModel<V>>>(); - for(BitSet pv : clustersMap.keySet()) { + for (BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(int i = 0; i < parallelClusters.size(); i++) { + for (int i = 0; i < parallelClusters.size(); i++) { Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i); Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.second); - cluster.setModel(new SubspaceModel<V>(new Subspace<V>(c.first), DatabaseUtil.centroid(database, c.second))); + cluster.setModel(new SubspaceModel<V>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); cluster.setHierarchy(new HierarchyReferenceLists<Cluster<SubspaceModel<V>>>(cluster, new ArrayList<Cluster<SubspaceModel<V>>>(), new ArrayList<Cluster<SubspaceModel<V>>>())); // cluster.setName("Cluster_" + num++); String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, ""); - if(parallelClusters.size() > 1) { + if (parallelClusters.size() > 1) { cluster.setName("Cluster_" + subspace + "_" + i); - } - else { + } else { cluster.setName("Cluster_" + subspace); } clusters.add(cluster); @@ -417,11 +418,11 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(); Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>>(); Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<BitSet, ArrayModifiableDBIDs>(new BitSet(), DBIDUtil.newArray()); - for(BitSet pv : clustersMap.keySet()) { + for (BitSet pv : clustersMap.keySet()) { // noise - if(pv.cardinality() == 0) { + if (pv.cardinality() == 0) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { noise.second.addDBIDs(c.second); } } @@ -429,11 +430,10 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste else { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(parallelClusters.size()); - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - if(!pv.equals(new BitSet()) && c.second.size() < minpts) { + for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + if (!pv.equals(new BitSet()) && c.second.size() < minpts) { notAssigned.add(c); - } - else { + } else { newParallelClusters.add(c); } } @@ -444,15 +444,14 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste clustersMap.clear(); clustersMap.putAll(newClustersMap); - for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { - if(c.second.isEmpty()) { + for (Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { + if (c.second.isEmpty()) { continue; } Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap); - if(parent != null) { + if (parent != null) { parent.second.addDBIDs(c.second); - } - else { + } else { noise.second.addDBIDs(c.second); } } @@ -472,30 +471,30 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * @return the parent of the specified cluster */ private Pair<BitSet, ArrayModifiableDBIDs> findParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Pair<BitSet, ArrayModifiableDBIDs> child, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { - V child_centroid = DatabaseUtil.centroid(database, child.second, child.first); + V child_centroid = ProjectedCentroid.make(child.first, database, child.second).toVector(database); Pair<BitSet, ArrayModifiableDBIDs> result = null; int resultCardinality = -1; BitSet childPV = child.first; int childCardinality = childPV.cardinality(); - for(BitSet parentPV : clustersMap.keySet()) { + for (BitSet parentPV : clustersMap.keySet()) { int parentCardinality = parentPV.cardinality(); - if(parentCardinality >= childCardinality) { + if (parentCardinality >= childCardinality) { continue; } - if(resultCardinality != -1 && parentCardinality <= resultCardinality) { + if (resultCardinality != -1 && parentCardinality <= resultCardinality) { continue; } BitSet pv = (BitSet) childPV.clone(); pv.and(parentPV); - if(pv.equals(parentPV)) { + if (pv.equals(parentPV)) { List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV); - for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { - V parent_centroid = DatabaseUtil.centroid(database, parent.second, parentPV); + for (Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { + V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database); double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV); - if(d <= 2 * epsilon) { + if (d <= 2 * epsilon) { result = parent; resultCardinality = parentCardinality; break; @@ -516,64 +515,62 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * @param database the database containing the data objects */ private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) { - StringBuffer msg = new StringBuffer(); - final int db_dim = DatabaseUtil.dimensionality(database); + StringBuilder msg = new StringBuilder(); + final int db_dim = RelationUtil.dimensionality(database); - for(int i = 0; i < clusters.size() - 1; i++) { + for (int i = 0; i < clusters.size() - 1; i++) { Cluster<SubspaceModel<V>> c_i = clusters.get(i); int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality(); - V ci_centroid = DatabaseUtil.centroid(database, c_i.getIDs(), c_i.getModel().getDimensions()); + V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database); - for(int j = i + 1; j < clusters.size(); j++) { + for (int j = i + 1; j < clusters.size(); j++) { Cluster<SubspaceModel<V>> c_j = clusters.get(j); int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality(); - if(subspaceDim_i < subspaceDim_j) { - if(logger.isDebugging()) { - msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append("]"); - msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append("]"); + if (subspaceDim_i < subspaceDim_j) { + if (LOG.isDebugging()) { + msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']'); + msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']'); } // noise level reached - if(c_j.getModel().getSubspace().dimensionality() == 0) { + if (c_j.getModel().getSubspace().dimensionality() == 0) { // no parents exists -> parent is noise - if(c_i.getParents().isEmpty()) { + if (c_i.getParents().isEmpty()) { c_j.getChildren().add(c_i); c_i.getParents().add(c_j); - if(logger.isDebugging()) { + if (LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); - msg.append("]"); + msg.append(']'); } } - } - else { - V cj_centroid = DatabaseUtil.centroid(database, c_j.getIDs(), c_j.getModel().getDimensions()); + } else { + V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions()); double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector()); - if(logger.isDebugging()) { + if (LOG.isDebugging()) { msg.append("\n dist = ").append(distance.getCorrelationValue()); } - if(distance.getCorrelationValue() == subspaceDim_j) { - if(logger.isDebugging()) { + if (distance.getCorrelationValue() == subspaceDim_j) { + if (LOG.isDebugging()) { msg.append("\n d = ").append(d); } - if(d <= 2 * epsilon) { + if (d <= 2 * epsilon) { // no parent exists or c_j is not a parent of the already // existing parents - if(c_i.getParents().isEmpty() || !isParent(database, distFunc, c_j, c_i.getParents())) { + if (c_i.getParents().isEmpty() || !isParent(database, distFunc, c_j, c_i.getParents())) { c_j.getChildren().add(c_i); c_i.getParents().add(c_j); - if(logger.isDebugging()) { + if (LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of ["); msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); - msg.append("]"); + msg.append(']'); } } - } - else { + } else { throw new RuntimeException("Should never happen: d = " + d); } } @@ -581,8 +578,8 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste } } } - if(logger.isDebugging()) { - logger.debug(msg.toString()); + if (LOG.isDebugging()) { + LOG.debug(msg.toString()); } } @@ -599,14 +596,14 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * the children clusters, false otherwise */ private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, List<Cluster<SubspaceModel<V>>> children) { - V parent_centroid = DatabaseUtil.centroid(database, parent.getIDs(), parent.getModel().getDimensions()); - int dimensionality = DatabaseUtil.dimensionality(database); + V parent_centroid = ProjectedCentroid.make(parent.getModel().getDimensions(), database, parent.getIDs()).toVector(database); + int dimensionality = RelationUtil.dimensionality(database); int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality(); - for(Cluster<SubspaceModel<V>> child : children) { - V child_centroid = DatabaseUtil.centroid(database, child.getIDs(), child.getModel().getDimensions()); + for (Cluster<SubspaceModel<V>> child : children) { + V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions()); - if(distance.getCorrelationValue() == subspaceDim_parent) { + if (distance.getCorrelationValue() == subspaceDim_parent) { return true; } } @@ -621,7 +618,7 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -631,7 +628,7 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { protected double epsilon = 0.0; protected int mu = 1; @@ -644,14 +641,16 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste protected void makeOptions(Parameterization config) { super.makeOptions(config); - DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, new GreaterEqualConstraint(0), 0.001); - if(config.grab(epsilonP)) { - epsilon = epsilonP.getValue(); + DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, 0.001); + epsilonP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(epsilonP)) { + epsilon = epsilonP.doubleValue(); } - IntParameter muP = new IntParameter(MU_ID, new GreaterConstraint(0), 1); - if(config.grab(muP)) { - mu = muP.getValue(); + IntParameter muP = new IntParameter(MU_ID, 1); + muP.addConstraint(new GreaterConstraint(0)); + if (config.grab(muP)) { + mu = muP.intValue(); } configDiSHDistance(config, epsilon, mu); @@ -703,4 +702,4 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste return new DiSH<V>(epsilon, dishDistance, opticsO); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 40ab60a8..58f3acef 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -34,7 +34,8 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.IntervalConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -60,11 +61,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; @Title("Finding Hierarchies of Subspace Clusters")
@Description("Algorithm for detecting hierarchies of subspace clusters.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Finding Hierarchies of Subspace Clusters", booktitle = "Proc. 10th Europ. Conf. on Principles and Practice of Knowledge Discovery in Databases (PKDD'06), Berlin, Germany, 2006", url = "http://www.dbs.ifi.lmu.de/Publikationen/Papers/PKDD06-HiSC.pdf")
-public class HiSC<V extends NumberVector<V, ?>> extends OPTICS<V, PreferenceVectorBasedCorrelationDistance> {
+public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorBasedCorrelationDistance> {
/**
* The logger for this class.
*/
- private static final Logging logger = Logging.getLogger(HiSC.class);
+ private static final Logging LOG = Logging.getLogger(HiSC.class);
/**
* Constructor.
@@ -77,7 +78,7 @@ public class HiSC<V extends NumberVector<V, ?>> extends OPTICS<V, PreferenceVect @Override
protected Logging getLogger() {
- return logger;
+ return LOG;
}
/**
@@ -87,16 +88,18 @@ public class HiSC<V extends NumberVector<V, ?>> extends OPTICS<V, PreferenceVect *
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
HiSCDistanceFunction<V> distanceFunction;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, new IntervalConstraint(0.0, IntervalConstraint.IntervalBoundary.OPEN, 1.0, IntervalConstraint.IntervalBoundary.OPEN), HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
+ DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
+ alphaP.addConstraint(new GreaterConstraint(0.0)); + alphaP.addConstraint(new LessConstraint(1.0));
double alpha = 0.0;
if(config.grab(alphaP)) {
- alpha = alphaP.getValue();
+ alpha = alphaP.doubleValue();
}
// Configure HiSC distance function
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 4eedbecd..ef49ff10 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -23,15 +23,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.iterator.TIntIterator; +import gnu.trove.set.TIntSet; +import gnu.trove.set.hash.TIntHashSet; + import java.util.ArrayList; import java.util.BitSet; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Random; -import java.util.Set; import de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedClustering; import de.lmu.ifi.dbs.elki.data.Cluster; @@ -47,16 +49,20 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; -import de.lmu.ifi.dbs.elki.database.query.GenericDistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; +import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultUtil; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -64,7 +70,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.CTriple; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -90,11 +96,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("PROCLUS: PROjected CLUStering") @Description("Algorithm to find subspace clusters in high dimensional spaces.") @Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188") -public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(PROCLUS.class); + private static final Logging LOG = Logging.getLogger(PROCLUS.class); /** * Parameter to specify the multiplier for the initial number of medoids, must @@ -106,12 +112,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * Key: {@code -proclus.mi} * </p> */ - public static final OptionID M_I_ID = OptionID.getOrCreateOptionID("proclus.mi", "The multiplier for the initial number of medoids."); - - /** - * Parameter to specify the random generator seed. - */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("proclus.seed", "The random number generator seed."); + public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids."); /** * Holds the value of {@link #M_I_ID}. @@ -119,9 +120,9 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus private int m_i; /** - * Holds the value of {@link #SEED_ID}. + * Random generator */ - private Long seed; + private RandomFactory rnd; /** * Java constructor. @@ -130,12 +131,12 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @param k_i k_i Parameter * @param l l Parameter * @param m_i m_i Parameter - * @param seed Random generator seed + * @param rnd Random generator */ - public PROCLUS(int k, int k_i, int l, int m_i, Long seed) { + public PROCLUS(int k, int k_i, int l, int m_i, RandomFactory rnd) { super(k, k_i, l); this.m_i = m_i; - this.seed = seed; + this.rnd = rnd; } /** @@ -147,19 +148,16 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); - final Random random = new Random(); - if(seed != null) { - random.setSeed(seed); - } + final Random random = rnd.getRandom(); - if(DatabaseUtil.dimensionality(relation) < l) { - throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + DatabaseUtil.dimensionality(relation) + " < " + l + ")"); + if(RelationUtil.dimensionality(relation) < l) { + throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")"); } // TODO: use a StepProgress! // initialization phase - if(logger.isVerbose()) { - logger.verbose("1. Initialization phase..."); + if(LOG.isVerbose()) { + LOG.verbose("1. Initialization phase..."); } int sampleSize = Math.min(relation.size(), k_i * k); DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random.nextLong()); @@ -167,39 +165,39 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus int medoidSize = Math.min(relation.size(), m_i * k); DBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random); - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n"); - msg.append("sampleSize ").append(sampleSize).append("\n"); - msg.append("sampleSet ").append(sampleSet).append("\n"); - msg.append("medoidSize ").append(medoidSize).append("\n"); - msg.append("m ").append(medoids).append("\n"); - logger.debugFine(msg.toString()); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n'); + msg.append("sampleSize ").append(sampleSize).append('\n'); + msg.append("sampleSet ").append(sampleSet).append('\n'); + msg.append("medoidSize ").append(medoidSize).append('\n'); + msg.append("m ").append(medoids).append('\n'); + LOG.debugFine(msg.toString()); } // iterative phase - if(logger.isVerbose()) { - logger.verbose("2. Iterative phase..."); + if(LOG.isVerbose()) { + LOG.verbose("2. Iterative phase..."); } double bestObjective = Double.POSITIVE_INFINITY; ModifiableDBIDs m_best = null; ModifiableDBIDs m_bad = null; ModifiableDBIDs m_current = initialSet(medoids, k, random); - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n"); - msg.append("m_c ").append(m_current).append("\n"); - logger.debugFine(msg.toString()); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n'); + msg.append("m_c ").append(m_current).append('\n'); + LOG.debugFine(msg.toString()); } - IndefiniteProgress cprogress = logger.isVerbose() ? new IndefiniteProgress("Current number of clusters:", logger) : null; + IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null; // TODO: Use DataStore and Trove for performance Map<DBID, PROCLUSCluster> clusters = null; int loops = 0; while(loops < 10) { - Map<DBID, Set<Integer>> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); + Map<DBID, TIntSet> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); clusters = assignPoints(dimensions, relation); double objectiveFunction = evaluateClusters(clusters, dimensions, relation); @@ -214,20 +212,20 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus m_current = computeM_current(medoids, m_best, m_bad, random); loops++; if(cprogress != null) { - cprogress.setProcessed(clusters.size(), logger); + cprogress.setProcessed(clusters.size(), LOG); } } if(cprogress != null) { - cprogress.setCompleted(logger); + cprogress.setCompleted(LOG); } // refinement phase - if(logger.isVerbose()) { - logger.verbose("3. Refinement phase..."); + if(LOG.isVerbose()) { + LOG.verbose("3. Refinement phase..."); } - List<Pair<V, Set<Integer>>> dimensions = findDimensions(new ArrayList<PROCLUSCluster>(clusters.values()), relation); + List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<PROCLUSCluster>(clusters.values()), relation); List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation); // build result @@ -235,7 +233,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("ProClus clustering", "proclus-clustering"); for(PROCLUSCluster c : finalClusters) { Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.objectIDs); - cluster.setModel(new SubspaceModel<V>(new Subspace<V>(c.getDimensions()), c.centroid)); + cluster.setModel(new SubspaceModel<V>(new Subspace(c.getDimensions()), c.centroid)); cluster.setName("cluster_" + numClusters++); result.addCluster(cluster); @@ -259,40 +257,41 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus // m_1 is random point of S DBID m_i = s.remove(random.nextInt(s.size())); medoids.add(m_i); - if(logger.isDebugging()) { - logger.debugFiner("medoids " + medoids); + if(LOG.isDebugging()) { + LOG.debugFiner("medoids " + medoids); } // compute distances between each point in S and m_i - Map<DBID, DistanceResultPair<DoubleDistance>> distances = new HashMap<DBID, DistanceResultPair<DoubleDistance>>(); + // FIXME: don't use maps, so we can work with DBIDRef + Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<DBID, DistanceDBIDPair<DoubleDistance>>(); for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); + DBID id = DBIDUtil.deref(iter); DoubleDistance dist = distFunc.distance(id, m_i); - distances.put(id, new GenericDistanceResultPair<DoubleDistance>(dist, id)); + distances.put(id, DBIDUtil.newDistancePair(dist, id)); } for(int i = 1; i < m; i++) { // choose medoid m_i to be far from prevois medoids - List<DistanceResultPair<DoubleDistance>> d = new ArrayList<DistanceResultPair<DoubleDistance>>(distances.values()); - Collections.sort(d); + List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<DistanceDBIDPair<DoubleDistance>>(distances.values()); + DistanceDBIDResultUtil.sortByDistance(d); - m_i = d.get(d.size() - 1).getDBID(); + m_i = DBIDUtil.deref(d.get(d.size() - 1)); medoids.add(m_i); s.remove(m_i); distances.remove(m_i); // compute distances of each point to closest medoid for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); + DBID id = DBIDUtil.deref(iter); DoubleDistance dist_new = distFunc.distance(id, m_i); DoubleDistance dist_old = distances.get(id).getDistance(); DoubleDistance dist = dist_new.compareTo(dist_old) < 0 ? dist_new : dist_old; - distances.put(id, new GenericDistanceResultPair<DoubleDistance>(dist, id)); + distances.put(id, DBIDUtil.newDistancePair(dist, id)); } - if(logger.isDebugging()) { - logger.debugFiner("medoids " + medoids); + if(LOG.isDebugging()) { + LOG.debugFiner("medoids " + medoids); } } @@ -332,7 +331,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus ModifiableDBIDs m_current = DBIDUtil.newHashSet(); for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { - DBID m_i = iter.getDBID(); + DBID m_i = DBIDUtil.deref(iter); if(m_bad.contains(m_i)) { int currentSize = m_current.size(); while(m_current.size() == currentSize) { @@ -359,17 +358,17 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @param distFunc the distance function * @return a mapping of the medoid's id to its locality */ - private Map<DBID, List<DistanceResultPair<DoubleDistance>>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { - Map<DBID, List<DistanceResultPair<DoubleDistance>>> result = new HashMap<DBID, List<DistanceResultPair<DoubleDistance>>>(); + private Map<DBID, DistanceDBIDResult<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { + Map<DBID, DistanceDBIDResult<DoubleDistance>> result = new HashMap<DBID, DistanceDBIDResult<DoubleDistance>>(); for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m = iter.getDBID(); + DBID m = DBIDUtil.deref(iter); // determine minimum distance between current medoid m and any other // medoid m_i DoubleDistance minDist = null; for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { - DBID m_i = iter2.getDBID(); - if(m_i == m) { + DBID m_i = DBIDUtil.deref(iter2); + if(DBIDUtil.equal(m_i, m)) { continue; } DoubleDistance currentDist = distFunc.distance(m, m_i); @@ -380,7 +379,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus // determine points in sphere centered at m with radius minDist assert minDist != null; - List<DistanceResultPair<DoubleDistance>> qr = rangeQuery.getRangeForDBID(m, minDist); + DistanceDBIDResult<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist); result.put(m, qr); } @@ -397,23 +396,23 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @return the set of correlated dimensions for each medoid in the specified * medoid set */ - private Map<DBID, Set<Integer>> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { + private Map<DBID, TIntSet> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { // get localities - Map<DBID, List<DistanceResultPair<DoubleDistance>>> localities = getLocalities(medoids, database, distFunc, rangeQuery); + Map<DBID, DistanceDBIDResult<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery); // compute x_ij = avg distance from points in l_i to medoid m_i - int dim = DatabaseUtil.dimensionality(database); + int dim = RelationUtil.dimensionality(database); Map<DBID, double[]> averageDistances = new HashMap<DBID, double[]>(); for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m_i = iter.getDBID(); + DBID m_i = DBIDUtil.deref(iter); V medoid_i = database.get(m_i); - List<DistanceResultPair<DoubleDistance>> l_i = localities.get(m_i); + DistanceDBIDResult<DoubleDistance> l_i = localities.get(m_i); double[] x_i = new double[dim]; - for(DistanceResultPair<DoubleDistance> qr : l_i) { - V o = database.get(qr.getDBID()); + for(DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { + V o = database.get(qr); for(int d = 0; d < dim; d++) { - x_i[d] += Math.abs(medoid_i.doubleValue(d + 1) - o.doubleValue(d + 1)); + x_i[d] += Math.abs(medoid_i.doubleValue(d) - o.doubleValue(d)); } } for(int d = 0; d < dim; d++) { @@ -422,11 +421,11 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus averageDistances.put(m_i, x_i); } - Map<DBID, Set<Integer>> dimensionMap = new HashMap<DBID, Set<Integer>>(); + Map<DBID, TIntSet> dimensionMap = new HashMap<DBID, TIntSet>(); List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<CTriple<Double, DBID, Integer>>(); for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m_i = iter.getDBID(); - Set<Integer> dims_i = new HashSet<Integer>(); + DBID m_i = DBIDUtil.deref(iter); + TIntSet dims_i = new TIntHashSet(); dimensionMap.put(m_i, dims_i); double[] x_i = averageDistances.get(m_i); @@ -447,7 +446,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus sigma_i = Math.sqrt(sigma_i); for(int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<Double, DBID, Integer>((x_i[j] - y_i) / sigma_i, m_i, j + 1)); + z_ijs.add(new CTriple<Double, DBID, Integer>((x_i[j] - y_i) / sigma_i, m_i, j)); } } Collections.sort(z_ijs); @@ -455,15 +454,15 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus int max = Math.max(k * l, 2); for(int m = 0; m < max; m++) { CTriple<Double, DBID, Integer> z_ij = z_ijs.get(m); - Set<Integer> dims_i = dimensionMap.get(z_ij.getSecond()); + TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); dims_i.add(z_ij.getThird()); - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n"); - msg.append("z_ij ").append(z_ij).append("\n"); - msg.append("D_i ").append(dims_i).append("\n"); - logger.debugFiner(msg.toString()); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n'); + msg.append("z_ij ").append(z_ij).append('\n'); + msg.append("D_i ").append(dims_i).append('\n'); + LOG.debugFiner(msg.toString()); } } return dimensionMap; @@ -478,9 +477,9 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @return the set of correlated dimensions for each specified cluster * centroid */ - private List<Pair<V, Set<Integer>>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) { + private List<Pair<V, TIntSet>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) { // compute x_ij = avg distance from points in c_i to c_i.centroid - int dim = DatabaseUtil.dimensionality(database); + int dim = RelationUtil.dimensionality(database); Map<Integer, double[]> averageDistances = new HashMap<Integer, double[]>(); for(int i = 0; i < clusters.size(); i++) { @@ -489,7 +488,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); for(int d = 0; d < dim; d++) { - x_i[d] += Math.abs(c_i.centroid.doubleValue(d + 1) - o.doubleValue(d + 1)); + x_i[d] += Math.abs(c_i.centroid.doubleValue(d) - o.doubleValue(d)); } } for(int d = 0; d < dim; d++) { @@ -518,38 +517,38 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus sigma_i = Math.sqrt(sigma_i); for(int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<Double, Integer, Integer>((x_i[j] - y_i) / sigma_i, i, j + 1)); + z_ijs.add(new CTriple<Double, Integer, Integer>((x_i[j] - y_i) / sigma_i, i, j)); } } Collections.sort(z_ijs); // mapping cluster index -> dimensions - Map<Integer, Set<Integer>> dimensionMap = new HashMap<Integer, Set<Integer>>(); + Map<Integer, TIntSet> dimensionMap = new HashMap<Integer, TIntSet>(); int max = Math.max(k * l, 2); for(int m = 0; m < max; m++) { CTriple<Double, Integer, Integer> z_ij = z_ijs.get(m); - Set<Integer> dims_i = dimensionMap.get(z_ij.getSecond()); + TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); if(dims_i == null) { - dims_i = new HashSet<Integer>(); + dims_i = new TIntHashSet(); dimensionMap.put(z_ij.getSecond(), dims_i); } dims_i.add(z_ij.getThird()); - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n"); - msg.append("z_ij ").append(z_ij).append("\n"); - msg.append("D_i ").append(dims_i).append("\n"); - logger.debugFiner(msg.toString()); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n'); + msg.append("z_ij ").append(z_ij).append('\n'); + msg.append("D_i ").append(dims_i).append('\n'); + LOG.debugFiner(msg.toString()); } } // mapping cluster -> dimensions - List<Pair<V, Set<Integer>>> result = new ArrayList<Pair<V, Set<Integer>>>(); + List<Pair<V, TIntSet>> result = new ArrayList<Pair<V, TIntSet>>(); for(int i : dimensionMap.keySet()) { - Set<Integer> dims_i = dimensionMap.get(i); + TIntSet dims_i = dimensionMap.get(i); PROCLUSCluster c_i = clusters.get(i); - result.add(new Pair<V, Set<Integer>>(c_i.centroid, dims_i)); + result.add(new Pair<V, TIntSet>(c_i.centroid, dims_i)); } return result; } @@ -562,26 +561,26 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @param database the database containing the objects * @return the assignments of the object to the clusters */ - private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, Set<Integer>> dimensions, Relation<V> database) { + private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, TIntSet> dimensions, Relation<V> database) { Map<DBID, ModifiableDBIDs> clusterIDs = new HashMap<DBID, ModifiableDBIDs>(); for(DBID m_i : dimensions.keySet()) { clusterIDs.put(m_i, DBIDUtil.newHashSet()); } for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { - DBID p_id = it.getDBID(); + DBID p_id = DBIDUtil.deref(it); V p = database.get(p_id); - DistanceResultPair<DoubleDistance> minDist = null; + DistanceDBIDPair<DoubleDistance> minDist = null; for(DBID m_i : dimensions.keySet()) { V m = database.get(m_i); - DistanceResultPair<DoubleDistance> currentDist = new GenericDistanceResultPair<DoubleDistance>(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i); - if(minDist == null || currentDist.compareTo(minDist) < 0) { + DistanceDBIDPair<DoubleDistance> currentDist = DBIDUtil.newDistancePair(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i); + if(minDist == null || currentDist.compareByDistance(minDist) < 0) { minDist = currentDist; } } // add p to cluster with mindist assert minDist != null; - ModifiableDBIDs ids = clusterIDs.get(minDist.getDBID()); + ModifiableDBIDs ids = clusterIDs.get(DBIDUtil.deref(minDist)); ids.add(p_id); } @@ -589,17 +588,17 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus for(DBID m_i : dimensions.keySet()) { ModifiableDBIDs objectIDs = clusterIDs.get(m_i); if(!objectIDs.isEmpty()) { - Set<Integer> clusterDimensions = dimensions.get(m_i); - V centroid = DatabaseUtil.centroid(database, objectIDs); + TIntSet clusterDimensions = dimensions.get(m_i); + V centroid = Centroid.make(database, objectIDs).toVector(database); clusters.put(m_i, new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); } } - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n"); - msg.append("clusters ").append(clusters).append("\n"); - logger.debugFine(msg.toString()); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n'); + msg.append("clusters ").append(clusters).append('\n'); + LOG.debugFine(msg.toString()); } return clusters; } @@ -612,20 +611,20 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @param database the database containing the objects * @return the assignments of the object to the clusters */ - private List<PROCLUSCluster> finalAssignment(List<Pair<V, Set<Integer>>> dimensions, Relation<V> database) { + private List<PROCLUSCluster> finalAssignment(List<Pair<V, TIntSet>> dimensions, Relation<V> database) { Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<Integer, ModifiableDBIDs>(); for(int i = 0; i < dimensions.size(); i++) { clusterIDs.put(i, DBIDUtil.newHashSet()); } for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { - DBID p_id = it.getDBID(); + DBID p_id = DBIDUtil.deref(it); V p = database.get(p_id); Pair<DoubleDistance, Integer> minDist = null; for(int i = 0; i < dimensions.size(); i++) { - Pair<V, Set<Integer>> pair_i = dimensions.get(i); + Pair<V, TIntSet> pair_i = dimensions.get(i); V c_i = pair_i.first; - Set<Integer> dimensions_i = pair_i.second; + TIntSet dimensions_i = pair_i.second; DoubleDistance currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i); if(minDist == null || currentDist.compareTo(minDist.first) < 0) { minDist = new Pair<DoubleDistance, Integer>(currentDist, i); @@ -641,17 +640,17 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus for(int i = 0; i < dimensions.size(); i++) { ModifiableDBIDs objectIDs = clusterIDs.get(i); if(!objectIDs.isEmpty()) { - Set<Integer> clusterDimensions = dimensions.get(i).second; - V centroid = DatabaseUtil.centroid(database, objectIDs); + TIntSet clusterDimensions = dimensions.get(i).second; + V centroid = Centroid.make(database, objectIDs).toVector(database); clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); } } - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n"); - msg.append("clusters ").append(clusters).append("\n"); - logger.debugFine(msg.toString()); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n'); + msg.append("clusters ").append(clusters).append('\n'); + LOG.debugFine(msg.toString()); } return clusters; } @@ -666,9 +665,10 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @return the Manhattan segmental distance between o1 and o2 relative to the * specified dimensions */ - private DoubleDistance manhattanSegmentalDistance(V o1, V o2, Set<Integer> dimensions) { + private DoubleDistance manhattanSegmentalDistance(V o1, V o2, TIntSet dimensions) { double result = 0; - for(Integer d : dimensions) { + for (TIntIterator iter = dimensions.iterator(); iter.hasNext(); ) { + final int d = iter.next(); result += Math.abs(o1.doubleValue(d) - o2.doubleValue(d)); } result /= dimensions.size(); @@ -683,15 +683,16 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @param database the database holding the objects * @return a measure for the cluster quality */ - private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, Set<Integer>> dimensions, Relation<V> database) { + private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, TIntSet> dimensions, Relation<V> database) { double result = 0; for(DBID m_i : clusters.keySet()) { PROCLUSCluster c_i = clusters.get(m_i); V centroid_i = c_i.centroid; - Set<Integer> dims_i = dimensions.get(m_i); + TIntSet dims_i = dimensions.get(m_i); double w_i = 0; - for(Integer j : dims_i) { + for (TIntIterator iter = dims_i.iterator(); iter.hasNext(); ) { + final int j = iter.next(); w_i += avgDistance(centroid_i, c_i.objectIDs, database, j); } @@ -714,12 +715,12 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * specified dimension */ private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) { - double avg = 0; + Mean avg = new Mean(); for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); - avg += Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension)); + avg.put(Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension))); } - return avg / objectIDs.size(); + return avg.getMean(); } /** @@ -748,7 +749,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -765,7 +766,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus /** * The correlated dimensions of this cluster. */ - Set<Integer> dimensions; + TIntSet dimensions; /** * The centroids of this cluster along each dimension. @@ -779,7 +780,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * @param dimensions the correlated dimensions of this cluster * @param centroid the centroid of this cluster */ - public PROCLUSCluster(ModifiableDBIDs objectIDs, Set<Integer> dimensions, V centroid) { + public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, V centroid) { this.objectIDs = objectIDs; this.dimensions = dimensions; this.centroid = centroid; @@ -787,19 +788,19 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus @Override public String toString() { - StringBuffer result = new StringBuffer(); + StringBuilder result = new StringBuilder(); result.append("Dimensions: ["); boolean notFirst = false; - for(Integer d : dimensions) { + for(TIntIterator iter = dimensions.iterator(); iter.hasNext(); ) { if(notFirst) { - result.append(","); + result.append(','); } else { notFirst = true; } - result.append(d); + result.append(iter.next()); } - result.append("]"); + result.append(']'); result.append("\nCentroid: ").append(centroid); return result.toString(); @@ -812,8 +813,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus */ public BitSet getDimensions() { BitSet result = new BitSet(); - for(int d : dimensions) { - result.set(d - 1); + for(TIntIterator iter = dimensions.iterator(); iter.hasNext(); ) { + result.set(iter.next()); } return result; } @@ -826,10 +827,15 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractProjectedClustering.Parameterizer { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedClustering.Parameterizer { + /** + * Parameter to specify the random generator seed. + */ + public static final OptionID SEED_ID = new OptionID("proclus.seed", "The random number generator seed."); + protected int m_i = -1; - protected Long seed = null; + protected RandomFactory rnd; @Override protected void makeOptions(Parameterization config) { @@ -845,15 +851,15 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus m_i = m_iP.getValue(); } - LongParameter seedP = new LongParameter(SEED_ID, true); - if(config.grab(seedP)) { - seed = seedP.getValue(); + RandomParameter rndP = new RandomParameter(SEED_ID); + if(config.grab(rndP)) { + rnd = rndP.getValue(); } } @Override protected PROCLUS<V> makeInstance() { - return new PROCLUS<V>(k, k_i, l, m_i, seed); + return new PROCLUS<V>(k, k_i, l, m_i, rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java index 4ca5a564..fc3228eb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java @@ -58,11 +58,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz @Title("PreDeCon: Subspace Preference weighted Density Connected Clustering") @Description("PreDeCon computes clusters of subspace preference weighted connected points. " + "The algorithm searches for local subgroups of a set of feature vectors having " + "a low variance along one or more (but not all) attributes.") @Reference(authors = "C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density Connected Clustering with Local Subspace Preferences", booktitle = "Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004", url = "http://dx.doi.org/10.1109/ICDM.2004.10087") -public class PreDeCon<V extends NumberVector<V, ?>> extends AbstractProjectedDBSCAN<Clustering<Model>, V> { +public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN<Clustering<Model>, V> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(PreDeCon.class); + private static final Logging LOG = Logging.getLogger(PreDeCon.class); /** * Constructor. @@ -88,7 +88,7 @@ public class PreDeCon<V extends NumberVector<V, ?>> extends AbstractProjectedDBS @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -98,7 +98,7 @@ public class PreDeCon<V extends NumberVector<V, ?>> extends AbstractProjectedDBS * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractProjectedDBSCAN.Parameterizer<V, DoubleDistance> { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedDBSCAN.Parameterizer<V, DoubleDistance> { @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index c47c74b6..46c5f0b8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -42,12 +42,13 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.AbstractDimensionsSelectingDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; -import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -84,11 +85,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @Title("SUBCLU: Density connected Subspace Clustering") @Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.") @Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004") -public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(SUBCLU.class); + private static final Logging LOG = Logging.getLogger(SUBCLU.class); /** * The distance function to determine the distance between database objects. @@ -99,7 +100,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -subclu.distancefunction} * </p> */ - public static final OptionID DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("subclu.distancefunction", "Distance function to determine the distance between database objects."); + public static final OptionID DISTANCE_FUNCTION_ID = new OptionID("subclu.distancefunction", "Distance function to determine the distance between database objects."); /** * Parameter to specify the maximum radius of the neighborhood to be @@ -109,7 +110,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -subclu.epsilon} * </p> */ - public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("subclu.epsilon", "The maximum radius of the neighborhood to be considered."); + public static final OptionID EPSILON_ID = new OptionID("subclu.epsilon", "The maximum radius of the neighborhood to be considered."); /** * Parameter to specify the threshold for minimum number of points in the @@ -118,7 +119,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * Key: {@code -subclu.minpts} * </p> */ - public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("subclu.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); + public static final OptionID MINPTS_ID = new OptionID("subclu.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); /** * Holds the instance of the distance function specified by @@ -162,36 +163,36 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @return Clustering result */ public Clustering<SubspaceModel<V>> run(Relation<V> relation) { - final int dimensionality = DatabaseUtil.dimensionality(relation); + final int dimensionality = RelationUtil.dimensionality(relation); - StepProgress stepprog = logger.isVerbose() ? new StepProgress(dimensionality) : null; + StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters if(stepprog != null) { - stepprog.beginStep(1, "Generate all 1-dimensional clusters.", logger); + stepprog.beginStep(1, "Generate all 1-dimensional clusters.", LOG); } // mapping of dimensionality to set of subspaces - HashMap<Integer, List<Subspace<V>>> subspaceMap = new HashMap<Integer, List<Subspace<V>>>(); + HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<Integer, List<Subspace>>(); // list of 1-dimensional subspaces containing clusters - List<Subspace<V>> s_1 = new ArrayList<Subspace<V>>(); + List<Subspace> s_1 = new ArrayList<Subspace>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters - TreeMap<Subspace<V>, List<Cluster<Model>>> clusterMap = new TreeMap<Subspace<V>, List<Cluster<Model>>>(new Subspace.DimensionComparator()); + TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<Subspace, List<Cluster<Model>>>(new Subspace.DimensionComparator()); for(int d = 0; d < dimensionality; d++) { - Subspace<V> currentSubspace = new Subspace<V>(d); + Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); - if(logger.isDebuggingFiner()) { - StringBuffer msg = new StringBuffer(); - msg.append("\n").append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n"); + if(LOG.isDebuggingFiner()) { + StringBuilder msg = new StringBuilder(); + msg.append('\n').append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n"); for(Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } - logger.debugFiner(msg.toString()); + LOG.debugFiner(msg.toString()); } if(!clusters.isEmpty()) { @@ -203,26 +204,26 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus // Generate (d+1)-dimensional clusters from d-dimensional clusters for(int d = 0; d < dimensionality - 1; d++) { if(stepprog != null) { - stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", logger); + stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } - List<Subspace<V>> subspaces = subspaceMap.get(d); + List<Subspace> subspaces = subspaceMap.get(d); if(subspaces == null || subspaces.isEmpty()) { if(stepprog != null) { for(int dim = d + 1; dim < dimensionality - 1; dim++) { - stepprog.beginStep(dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", logger); + stepprog.beginStep(dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG); } } break; } - List<Subspace<V>> candidates = generateSubspaceCandidates(subspaces); - List<Subspace<V>> s_d = new ArrayList<Subspace<V>>(); + List<Subspace> candidates = generateSubspaceCandidates(subspaces); + List<Subspace> s_d = new ArrayList<Subspace>(); - for(Subspace<V> candidate : candidates) { - Subspace<V> bestSubspace = bestSubspace(subspaces, candidate, clusterMap); - if(logger.isDebuggingFine()) { - logger.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); + for(Subspace candidate : candidates) { + Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); + if(LOG.isDebuggingFine()) { + LOG.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); @@ -234,13 +235,13 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus } } - if(logger.isDebuggingFine()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebuggingFine()) { + StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); for(Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } - logger.debugFine(msg.toString()); + LOG.debugFine(msg.toString()); } if(!clusters.isEmpty()) { @@ -257,18 +258,18 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus // build result int numClusters = 1; result = new Clustering<SubspaceModel<V>>("SUBCLU clustering", "subclu-clustering"); - for(Subspace<V> subspace : clusterMap.descendingKeySet()) { + for(Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); for(Cluster<Model> cluster : clusters) { Cluster<SubspaceModel<V>> newCluster = new Cluster<SubspaceModel<V>>(cluster.getIDs()); - newCluster.setModel(new SubspaceModel<V>(subspace, DatabaseUtil.centroid(relation, cluster.getIDs()))); + newCluster.setModel(new SubspaceModel<V>(subspace, Centroid.make(relation, cluster.getIDs()).toVector(relation))); newCluster.setName("cluster_" + numClusters++); result.addCluster(newCluster); } } if(stepprog != null) { - stepprog.setCompleted(logger); + stepprog.setCompleted(LOG); } return result; } @@ -294,7 +295,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @param subspace the subspace to run DBSCAN on * @return the clustering result of the DBSCAN run */ - private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace<V> subspace) { + private List<Cluster<Model>> runDBSCAN(Relation<V> relation, DBIDs ids, Subspace subspace) { // distance function distanceFunction.setSelectedDimensions(subspace.getDimensions()); @@ -309,8 +310,8 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus DBSCAN<V, DoubleDistance> dbscan = new DBSCAN<V, DoubleDistance>(distanceFunction, epsilon, minpts); // run DBSCAN - if(logger.isVerbose()) { - logger.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); + if(LOG.isVerbose()) { + LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); } Clustering<Model> dbsres = dbscan.run(proxy); @@ -332,8 +333,8 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @param subspaces the {@code d}-dimensional subspaces * @return the {@code d+1}-dimensional subspace candidates */ - private List<Subspace<V>> generateSubspaceCandidates(List<Subspace<V>> subspaces) { - List<Subspace<V>> candidates = new ArrayList<Subspace<V>>(); + private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) { + List<Subspace> candidates = new ArrayList<Subspace>(); if(subspaces.isEmpty()) { return candidates; @@ -342,28 +343,28 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus // Generate (d+1)-dimensional candidate subspaces int d = subspaces.get(0).dimensionality(); - StringBuffer msgFine = new StringBuffer("\n"); - if(logger.isDebuggingFiner()) { - msgFine.append("subspaces ").append(subspaces).append("\n"); + StringBuilder msgFine = new StringBuilder("\n"); + if(LOG.isDebuggingFiner()) { + msgFine.append("subspaces ").append(subspaces).append('\n'); } for(int i = 0; i < subspaces.size(); i++) { - Subspace<V> s1 = subspaces.get(i); + Subspace s1 = subspaces.get(i); for(int j = i + 1; j < subspaces.size(); j++) { - Subspace<V> s2 = subspaces.get(j); - Subspace<V> candidate = s1.join(s2); + Subspace s2 = subspaces.get(j); + Subspace candidate = s1.join(s2); if(candidate != null) { - if(logger.isDebuggingFiner()) { - msgFine.append("candidate: ").append(candidate.dimensonsToString()).append("\n"); + if(LOG.isDebuggingFiner()) { + msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n'); } // prune irrelevant candidate subspaces - List<Subspace<V>> lowerSubspaces = lowerSubspaces(candidate); - if(logger.isDebuggingFiner()) { - msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append("\n"); + List<Subspace> lowerSubspaces = lowerSubspaces(candidate); + if(LOG.isDebuggingFiner()) { + msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n'); } boolean irrelevantCandidate = false; - for(Subspace<V> s : lowerSubspaces) { + for(Subspace s : lowerSubspaces) { if(!subspaces.contains(s)) { irrelevantCandidate = true; break; @@ -376,16 +377,16 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus } } - if(logger.isDebuggingFiner()) { - logger.debugFiner(msgFine.toString()); + if(LOG.isDebuggingFiner()) { + LOG.debugFiner(msgFine.toString()); } - if(logger.isDebugging()) { - StringBuffer msg = new StringBuffer(); + if(LOG.isDebugging()) { + StringBuilder msg = new StringBuilder(); msg.append(d + 1).append("-dimensional candidate subspaces: "); - for(Subspace<V> candidate : candidates) { - msg.append(candidate.dimensonsToString()).append(" "); + for(Subspace candidate : candidates) { + msg.append(candidate.dimensonsToString()).append(' '); } - logger.debug(msg.toString()); + LOG.debug(msg.toString()); } return candidates; @@ -398,19 +399,19 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @param subspace the {@code d}-dimensional subspace * @return a list of all {@code (d-1)}-dimensional subspaces */ - private List<Subspace<V>> lowerSubspaces(Subspace<V> subspace) { + private List<Subspace> lowerSubspaces(Subspace subspace) { int dimensionality = subspace.dimensionality(); if(dimensionality <= 1) { return null; } // order result according to the dimensions - List<Subspace<V>> result = new ArrayList<Subspace<V>>(); + List<Subspace> result = new ArrayList<Subspace>(); BitSet dimensions = subspace.getDimensions(); for(int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) { BitSet newDimensions = (BitSet) dimensions.clone(); newDimensions.set(dim, false); - result.add(new Subspace<V>(newDimensions)); + result.add(new Subspace(newDimensions)); } return result; @@ -428,10 +429,10 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * -dimensional candidate with minimal number of objects in the * cluster */ - private Subspace<V> bestSubspace(List<Subspace<V>> subspaces, Subspace<V> candidate, TreeMap<Subspace<V>, List<Cluster<Model>>> clusterMap) { - Subspace<V> bestSubspace = null; + private Subspace bestSubspace(List<Subspace> subspaces, Subspace candidate, TreeMap<Subspace, List<Cluster<Model>>> clusterMap) { + Subspace bestSubspace = null; - for(Subspace<V> subspace : subspaces) { + for(Subspace subspace : subspaces) { int min = Integer.MAX_VALUE; if(subspace.isSubspace(candidate)) { @@ -456,7 +457,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -466,7 +467,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { protected int minpts = 0; protected DoubleDistance epsilon = null; @@ -486,7 +487,8 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus epsilon = epsilonP.getValue(); } - IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0)); + IntParameter minptsP = new IntParameter(MINPTS_ID); + minptsP.addConstraint(new GreaterConstraint(0)); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java index eff71a35..6b22b233 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java @@ -46,7 +46,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param <V> the type of NumberVector this subspace contains */ -public class CLIQUESubspace<V extends NumberVector<V, ?>> extends Subspace<V> { +public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { /** * The dense units belonging to this subspace. */ @@ -103,14 +103,14 @@ public class CLIQUESubspace<V extends NumberVector<V, ?>> extends Subspace<V> { * * @return the clusters in this subspace and the corresponding cluster models */ - public List<Pair<Subspace<V>, ModifiableDBIDs>> determineClusters() { - List<Pair<Subspace<V>, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace<V>, ModifiableDBIDs>>(); + public List<Pair<Subspace, ModifiableDBIDs>> determineClusters() { + List<Pair<Subspace, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace, ModifiableDBIDs>>(); for(CLIQUEUnit<V> unit : getDenseUnits()) { if(!unit.isAssigned()) { ModifiableDBIDs cluster = DBIDUtil.newHashSet(); CLIQUESubspace<V> model = new CLIQUESubspace<V>(getDimensions()); - clusters.add(new Pair<Subspace<V>, ModifiableDBIDs>(model, cluster)); + clusters.add(new Pair<Subspace, ModifiableDBIDs>(model, cluster)); dfs(unit, cluster, model); } } @@ -151,8 +151,8 @@ public class CLIQUESubspace<V extends NumberVector<V, ?>> extends Subspace<V> { * @param dim the dimension * @return the left neighbor of the given unit in the specified dimension */ - public CLIQUEUnit<V> leftNeighbor(CLIQUEUnit<V> unit, Integer dim) { - Interval i = unit.getInterval(dim); + public CLIQUEUnit<V> leftNeighbor(CLIQUEUnit<V> unit, int dim) { + Interval i = unit.getInterval(Integer.valueOf(dim)); for(CLIQUEUnit<V> u : getDenseUnits()) { if(u.containsLeftNeighbor(i)) { @@ -238,10 +238,10 @@ public class CLIQUESubspace<V extends NumberVector<V, ?>> extends Subspace<V> { */ @Override public String toString(String pre) { - StringBuffer result = new StringBuffer(); + StringBuilder result = new StringBuilder(); result.append(super.toString(pre)); - result.append("\n").append(pre).append("Coverage: ").append(coverage); - result.append("\n").append(pre).append("Units: " + "\n"); + result.append('\n').append(pre).append("Coverage: ").append(coverage); + result.append('\n').append(pre).append("Units: " + "\n"); for(CLIQUEUnit<V> denseUnit : getDenseUnits()) { result.append(pre).append(" ").append(denseUnit.toString()).append(" ").append(denseUnit.getIds().size()).append(" objects\n"); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java index db687567..70f251c9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java @@ -23,15 +23,15 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.HashMap; +import gnu.trove.map.hash.TIntObjectHashMap; + import java.util.Iterator; -import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; import de.lmu.ifi.dbs.elki.data.Interval; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; @@ -46,7 +46,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; * * @param <V> the type of NumberVector this unit contains */ -public class CLIQUEUnit<V extends NumberVector<V, ?>> { +public class CLIQUEUnit<V extends NumberVector<?>> { /** * The one-dimensional intervals of which this unit is build. */ @@ -56,7 +56,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { * Provides a mapping of particular dimensions to the intervals of which this * unit is build. */ - private Map<Integer, Interval> dimensionToInterval; + private TIntObjectHashMap<Interval> dimensionToInterval; /** * The ids of the feature vectors this unit contains. @@ -77,7 +77,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { public CLIQUEUnit(SortedSet<Interval> intervals, ModifiableDBIDs ids) { this.intervals = intervals; - dimensionToInterval = new HashMap<Integer, Interval>(); + dimensionToInterval = new TIntObjectHashMap<Interval>(); for(Interval interval : intervals) { dimensionToInterval.put(interval.getDimension(), interval); } @@ -96,7 +96,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { intervals = new TreeSet<Interval>(); intervals.add(interval); - dimensionToInterval = new HashMap<Integer, Interval>(); + dimensionToInterval = new TIntObjectHashMap<Interval>(); dimensionToInterval.put(interval.getDimension(), interval); ids = DBIDUtil.newHashSet(); @@ -114,7 +114,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { */ public boolean contains(V vector) { for(Interval interval : intervals) { - double value = vector.doubleValue(interval.getDimension() + 1); + final double value = vector.doubleValue(interval.getDimension()); if(interval.getMin() > value || value >= interval.getMax()) { return false; } @@ -131,7 +131,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { * @return true, if this unit contains the specified feature vector, false * otherwise */ - public boolean addFeatureVector(DBID id, V vector) { + public boolean addFeatureVector(DBIDRef id, V vector) { if(contains(vector)) { ids.add(id); return true; @@ -284,9 +284,9 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> { */ @Override public String toString() { - StringBuffer result = new StringBuffer(); + StringBuilder result = new StringBuilder(); for(Interval interval : intervals) { - result.append(interval).append(" "); + result.append(interval).append(' '); } return result.toString(); |