diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java | 402 |
1 files changed, 156 insertions, 246 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 68878aef..8075a9f7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,270 +23,109 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; - import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.COPACNeighborPredicate; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.ClusterModel; import de.lmu.ifi.dbs.elki.data.model.DimensionModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ProxyDatabase; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.FilteredLocalPCABasedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.index.preprocessed.LocalProjectionIndex; -import de.lmu.ifi.dbs.elki.index.preprocessed.LocalProjectionIndex.Factory; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner; +import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** - * Provides the COPAC algorithm, an algorithm to partition a database according - * to the correlation dimension of its objects and to then perform an arbitrary - * clustering algorithm over the partitions. + * COPAC is an algorithm to partition a database according to the correlation + * dimension of its objects and to then perform an arbitrary clustering + * algorithm over the partitions. + * + * Reference: * <p> - * Reference: Achtert E., Böhm C., Kriegel H.-P., Kröger P., Zimek A.: Robust, - * Complete, and Efficient Correlation Clustering. <br> + * E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek:<br /> + * Robust, Complete, and Efficient Correlation Clustering.<br /> * In Proc. 7th SIAM International Conference on Data Mining (SDM'07), * Minneapolis, MN, 2007 * </p> * * @author Arthur Zimek * - * @apiviz.uses LocalProjectionIndex - * @apiviz.uses FilteredLocalPCABasedDistanceFunction * @apiviz.has DimensionModel + * @apiviz.composedOf COPACNeighborPredicate * * @param <V> the type of NumberVector handled by this Algorithm */ @Title("COPAC: COrrelation PArtition Clustering") -@Description("Partitions a database according to the correlation dimension of its objects and performs " + "a clustering algorithm over the partitions.") -@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger P., A. Zimek", title = "Robust, Complete, and Efficient Correlation Clustering", booktitle = "Proc. 7th SIAM International Conference on Data Mining (SDM'07), Minneapolis, MN, 2007", url = "http://www.siam.org/proceedings/datamining/2007/dm07_037achtert.pdf") -public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { +@Description("Partitions a database according to the correlation dimension of its objects and performs " // + + "a clustering algorithm over the partitions.") +@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek", // +title = "Robust, Complete, and Efficient Correlation Clustering", // +booktitle = "Proc. 7th SIAM International Conference on Data Mining (SDM'07), Minneapolis, MN, 2007", // +url = "http://www.siam.org/proceedings/datamining/2007/dm07_037achtert.pdf") +public class COPAC<V extends NumberVector> extends AbstractAlgorithm<Clustering<DimensionModel>> implements ClusteringAlgorithm<Clustering<DimensionModel>> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(COPAC.class); /** - * Parameter to specify the local PCA preprocessor to derive partition - * criterion, must extend - * {@link de.lmu.ifi.dbs.elki.index.preprocessed.localpca.AbstractFilteredPCAIndex}. - * <p> - * Key: {@code -copac.preprocessor} - * </p> - */ - public static final OptionID PREPROCESSOR_ID = new OptionID("copac.preprocessor", "Local PCA Preprocessor to derive partition criterion."); - - /** - * Parameter to specify the distance function to use inside the partitions - * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.AbstractIndexBasedDistanceFunction} - * . - * <p> - * Default value: - * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction} - * </p> - * <p> - * Key: {@code -copac.partitionDistance} - * </p> - */ - public static final OptionID PARTITION_DISTANCE_ID = new OptionID("copac.partitionDistance", "Distance to use for the inner algorithms."); - - /** - * Parameter to specify the clustering algorithm to apply to each partition, - * must extend - * {@link de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm}. - * <p> - * Key: {@code -copac.partitionAlgorithm} - * </p> - */ - public static final OptionID PARTITION_ALGORITHM_ID = new OptionID("copac.partitionAlgorithm", "Clustering algorithm to apply to each partition."); - - /** - * Holds the instance of the preprocessed distance function - * {@link #PARTITION_DISTANCE_ID}. - */ - private FilteredLocalPCABasedDistanceFunction<V, ?, D> partitionDistanceFunction; - - /** - * Get the algorithm to run on each partition. - */ - private Class<? extends ClusteringAlgorithm<Clustering<Model>>> partitionAlgorithm; - - /** - * Holds the parameters of the algorithm to run on each partition. - */ - private Collection<Pair<OptionID, Object>> partitionAlgorithmParameters; - - /** - * The last used distance query + * Settings class. */ - // FIXME: remove this when migrating to a full Factory pattern! This is - // non-reentrant! - private FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D> partitionDistanceQuery; + COPAC.Settings settings; /** * Constructor. * - * @param partitionDistanceFunction Distance function - * @param partitionAlgorithm Algorithm to use on partitions - * @param partitionAlgorithmParameters Parameters for Algorithm to run on - * partitions + * @param settings COPAC parameters */ - public COPAC(FilteredLocalPCABasedDistanceFunction<V, ?, D> partitionDistanceFunction, Class<? extends ClusteringAlgorithm<Clustering<Model>>> partitionAlgorithm, Collection<Pair<OptionID, Object>> partitionAlgorithmParameters) { + public COPAC(COPAC.Settings settings) { super(); - this.partitionDistanceFunction = partitionDistanceFunction; - this.partitionAlgorithm = partitionAlgorithm; - this.partitionAlgorithmParameters = partitionAlgorithmParameters; - } - - /** - * Performs the COPAC algorithm on the given database. - * - * @param relation Relation to process - * @return Clustering result - */ - @SuppressWarnings("unchecked") - public Clustering<Model> run(Relation<V> relation) { - if(LOG.isVerbose()) { - LOG.verbose("Running COPAC on db size = " + relation.size() + " with dimensionality = " + RelationUtil.dimensionality(relation)); - } - - partitionDistanceQuery = (FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D>) partitionDistanceFunction.instantiate(relation); - LocalProjectionIndex<V, ?> preprocin = partitionDistanceQuery.getIndex(); - - // partitioning - Map<Integer, ModifiableDBIDs> partitionMap = new HashMap<>(); - FiniteProgress partitionProgress = LOG.isVerbose() ? new FiniteProgress("Partitioning", relation.size(), LOG) : null; - int processed = 1; - - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - int corrdim = preprocin.getLocalProjection(iditer).getCorrelationDimension(); - - if(!partitionMap.containsKey(corrdim)) { - partitionMap.put(corrdim, DBIDUtil.newArray()); - } - - partitionMap.get(corrdim).add(iditer); - if(partitionProgress != null) { - partitionProgress.setProcessed(processed++, LOG); - } - } - - if(partitionProgress != null) { - partitionProgress.ensureCompleted(LOG); - } - if(LOG.isVerbose()) { - for(Integer corrDim : partitionMap.keySet()) { - ModifiableDBIDs list = partitionMap.get(corrDim); - LOG.verbose("Partition [corrDim = " + corrDim + "]: " + list.size() + " objects."); - } - } - - // convert for partition algorithm. - // TODO: do this with DynamicDBIDs instead - Map<Integer, DBIDs> pmap = new HashMap<>(); - for(Entry<Integer, ModifiableDBIDs> ent : partitionMap.entrySet()) { - pmap.put(ent.getKey(), ent.getValue()); - } - // running partition algorithm - return runPartitionAlgorithm(relation, pmap, partitionDistanceQuery); + this.settings = settings; } /** - * Runs the partition algorithm and creates the result. + * Run the COPAC algorithm. * - * @param relation the database to run this algorithm on - * @param partitionMap the map of partition IDs to object ids - * @param query The preprocessor based query function + * @param database Database + * @param relation Vector field relation + * @return COPAC clustering */ - private Clustering<Model> runPartitionAlgorithm(Relation<V> relation, Map<Integer, DBIDs> partitionMap, DistanceQuery<V, D> query) { - Clustering<Model> result = new Clustering<>("COPAC clustering", "copac-clustering"); - - // TODO: use an extra finite progress for the partitions? - for(Entry<Integer, DBIDs> pair : partitionMap.entrySet()) { - // noise partition - if(pair.getKey() == RelationUtil.dimensionality(relation)) { - // Make a Noise cluster - result.addToplevelCluster(new Cluster<Model>(pair.getValue(), true, ClusterModel.CLUSTER)); - } - else { - DBIDs partids = pair.getValue(); - ProxyDatabase proxy = new ProxyDatabase(partids, relation); - - ClusteringAlgorithm<Clustering<Model>> partitionAlgorithm = getPartitionAlgorithm(query); - if(LOG.isVerbose()) { - LOG.verbose("Running " + partitionAlgorithm.getClass().getName() + " on partition [corrDim = " + pair.getKey() + "]..."); - } - Clustering<Model> p = partitionAlgorithm.run(proxy); - // Re-Wrap resulting Clusters as DimensionModel clusters. - for(Cluster<Model> clus : p.getAllClusters()) { - if(clus.isNoise()) { - result.addToplevelCluster(new Cluster<Model>(clus.getIDs(), true, ClusterModel.CLUSTER)); - } - else { - result.addToplevelCluster(new Cluster<Model>(clus.getIDs(), new DimensionModel(pair.getKey()))); - } - } + public Clustering<DimensionModel> run(Database database, Relation<V> relation) { + COPACNeighborPredicate.Instance npred = new COPACNeighborPredicate<V>(settings).instantiate(database, relation); + CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database, TypeUtil.DBIDS); + Clustering<Model> dclusters = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run(); + // Re-wrap the detected clusters for COPAC: + Clustering<DimensionModel> result = new Clustering<>("COPAC clustering", "copac-clustering"); + // Generalized DBSCAN clusterings will be flat. + for(Hierarchy.Iter<Cluster<Model>> iter = dclusters.iterToplevelClusters(); iter.valid(); iter.advance()) { + Cluster<Model> clus = iter.get(); + if(clus.size() > 0) { + int dim = npred.dimensionality(clus.getIDs().iter()); + DimensionModel model = new DimensionModel(dim); + result.addToplevelCluster(new Cluster<>(clus.getIDs(), model)); } } return result; } - /** - * Returns the partition algorithm. - * - * @return the specified partition algorithm - */ - public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) { - ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters); - ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query); - reconfig.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist); - ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm); - reconfig.failOnErrors(); - return instance; - } - - /** - * Get the last used distance query (to expose access to the preprocessor) - * - * Used by ERiC. TODO: migrate to factory pattern! - * - * @return distance query - */ - public FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D> getPartitionDistanceQuery() { - return partitionDistanceQuery; - } - @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); @@ -298,57 +137,128 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs } /** - * Parameterization class. + * Class to wrap the COPAC settings. * * @author Erich Schubert * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractParameterizer { - protected LocalProjectionIndex.Factory<V, ?> indexI = null; - - protected FilteredLocalPCABasedDistanceFunction<V, ?, D> pdistI = null; - - protected Class<? extends ClusteringAlgorithm<Clustering<Model>>> algC = null; + public static class Settings { + /** + * Neighborhood size. + */ + public int k; + + /** + * Class to compute PCA. + */ + public PCAFilteredRunner pca; + + /** + * Epsilon value for GDBSCAN. + */ + public double epsilon; + + /** + * MinPts parameter. + */ + public int minpts; + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Size for the kNN neighborhood used in the PCA step of COPAC. + */ + public static final OptionID K_ID = new OptionID("copac.knn", "Number of neighbors to use for PCA."); + + /** + * Settings to build. + */ + Settings settings; + + @Override + public void makeOptions(Parameterization config) { + settings = new Settings(); + configK(config); + // TODO: allow using other PCA runners? + settings.pca = config.tryInstantiate(PCAFilteredRunner.class); + configEpsilon(config); + configMinPts(config); + } - protected Collection<Pair<OptionID, Object>> algO = null; + /** + * Configure the kNN parameter. + * + * @param config Parameter source + */ + protected void configK(Parameterization config) { + IntParameter kP = new IntParameter(K_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { + settings.k = kP.intValue(); + } + } - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - ClassParameter<Factory<V, ?>> indexP = new ClassParameter<>(PREPROCESSOR_ID, LocalProjectionIndex.Factory.class); - if(config.grab(indexP)) { - indexI = indexP.instantiateClass(config); + /** + * Configure the epsilon radius parameter. + * + * @param config Parameter source + */ + protected void configEpsilon(Parameterization config) { + DoubleParameter epsilonP = new DoubleParameter(DBSCAN.Parameterizer.EPSILON_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(epsilonP)) { + settings.epsilon = epsilonP.doubleValue(); + } } - ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>> pdistP = new ObjectParameter<>(PARTITION_DISTANCE_ID, FilteredLocalPCABasedDistanceFunction.class, LocallyWeightedDistanceFunction.class); - if(config.grab(pdistP)) { - ListParameterization predefinedDist = new ListParameterization(); - predefinedDist.addParameter(IndexBasedDistanceFunction.INDEX_ID, indexI); - ChainedParameterization chainDist = new ChainedParameterization(predefinedDist, config); - chainDist.errorsTo(config); - pdistI = pdistP.instantiateClass(chainDist); - predefinedDist.reportInternalParameterizationErrors(config); + /** + * Configure the minPts aka "mu" parameter. + * + * @param config Parameter source + */ + protected void configMinPts(Parameterization config) { + IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minptsP)) { + settings.minpts = minptsP.intValue(); + } } - // Parameterize algorithm: - ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class); - if(config.grab(algP)) { - ListParameterization predefined = new ListParameterization(); - predefined.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI); - TrackParameters trackpar = new TrackParameters(config); - ChainedParameterization chain = new ChainedParameterization(predefined, trackpar); - chain.errorsTo(config); - algP.instantiateClass(chain); - algC = algP.getValue(); - algO = trackpar.getGivenParameters(); - predefined.reportInternalParameterizationErrors(chain); + @Override + public Settings makeInstance() { + return settings; } } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * COPAC settings. + */ + protected COPAC.Settings settings; + + @Override + protected void makeOptions(Parameterization config) { + settings = config.tryInstantiate(COPAC.Settings.class); + } @Override - protected COPAC<V, D> makeInstance() { - return new COPAC<>(pdistI, algC, algO); + protected COPAC<V> makeInstance() { + return new COPAC<>(settings); } } }
\ No newline at end of file |