summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java402
1 files changed, 156 insertions, 246 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
index 68878aef..8075a9f7 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,270 +23,109 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
-
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.COPACNeighborPredicate;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.DimensionModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.FilteredLocalPCABasedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.index.preprocessed.LocalProjectionIndex;
-import de.lmu.ifi.dbs.elki.index.preprocessed.LocalProjectionIndex.Factory;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
- * Provides the COPAC algorithm, an algorithm to partition a database according
- * to the correlation dimension of its objects and to then perform an arbitrary
- * clustering algorithm over the partitions.
+ * COPAC is an algorithm to partition a database according to the correlation
+ * dimension of its objects and to then perform an arbitrary clustering
+ * algorithm over the partitions.
+ *
+ * Reference:
* <p>
- * Reference: Achtert E., Böhm C., Kriegel H.-P., Kröger P., Zimek A.: Robust,
- * Complete, and Efficient Correlation Clustering. <br>
+ * E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek:<br />
+ * Robust, Complete, and Efficient Correlation Clustering.<br />
* In Proc. 7th SIAM International Conference on Data Mining (SDM'07),
* Minneapolis, MN, 2007
* </p>
*
* @author Arthur Zimek
*
- * @apiviz.uses LocalProjectionIndex
- * @apiviz.uses FilteredLocalPCABasedDistanceFunction
* @apiviz.has DimensionModel
+ * @apiviz.composedOf COPACNeighborPredicate
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
@Title("COPAC: COrrelation PArtition Clustering")
-@Description("Partitions a database according to the correlation dimension of its objects and performs " + "a clustering algorithm over the partitions.")
-@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger P., A. Zimek", title = "Robust, Complete, and Efficient Correlation Clustering", booktitle = "Proc. 7th SIAM International Conference on Data Mining (SDM'07), Minneapolis, MN, 2007", url = "http://www.siam.org/proceedings/datamining/2007/dm07_037achtert.pdf")
-public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
+@Description("Partitions a database according to the correlation dimension of its objects and performs " //
+ + "a clustering algorithm over the partitions.")
+@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek", //
+title = "Robust, Complete, and Efficient Correlation Clustering", //
+booktitle = "Proc. 7th SIAM International Conference on Data Mining (SDM'07), Minneapolis, MN, 2007", //
+url = "http://www.siam.org/proceedings/datamining/2007/dm07_037achtert.pdf")
+public class COPAC<V extends NumberVector> extends AbstractAlgorithm<Clustering<DimensionModel>> implements ClusteringAlgorithm<Clustering<DimensionModel>> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(COPAC.class);
/**
- * Parameter to specify the local PCA preprocessor to derive partition
- * criterion, must extend
- * {@link de.lmu.ifi.dbs.elki.index.preprocessed.localpca.AbstractFilteredPCAIndex}.
- * <p>
- * Key: {@code -copac.preprocessor}
- * </p>
- */
- public static final OptionID PREPROCESSOR_ID = new OptionID("copac.preprocessor", "Local PCA Preprocessor to derive partition criterion.");
-
- /**
- * Parameter to specify the distance function to use inside the partitions
- * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.AbstractIndexBasedDistanceFunction}
- * .
- * <p>
- * Default value:
- * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction}
- * </p>
- * <p>
- * Key: {@code -copac.partitionDistance}
- * </p>
- */
- public static final OptionID PARTITION_DISTANCE_ID = new OptionID("copac.partitionDistance", "Distance to use for the inner algorithms.");
-
- /**
- * Parameter to specify the clustering algorithm to apply to each partition,
- * must extend
- * {@link de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm}.
- * <p>
- * Key: {@code -copac.partitionAlgorithm}
- * </p>
- */
- public static final OptionID PARTITION_ALGORITHM_ID = new OptionID("copac.partitionAlgorithm", "Clustering algorithm to apply to each partition.");
-
- /**
- * Holds the instance of the preprocessed distance function
- * {@link #PARTITION_DISTANCE_ID}.
- */
- private FilteredLocalPCABasedDistanceFunction<V, ?, D> partitionDistanceFunction;
-
- /**
- * Get the algorithm to run on each partition.
- */
- private Class<? extends ClusteringAlgorithm<Clustering<Model>>> partitionAlgorithm;
-
- /**
- * Holds the parameters of the algorithm to run on each partition.
- */
- private Collection<Pair<OptionID, Object>> partitionAlgorithmParameters;
-
- /**
- * The last used distance query
+ * Settings class.
*/
- // FIXME: remove this when migrating to a full Factory pattern! This is
- // non-reentrant!
- private FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D> partitionDistanceQuery;
+ COPAC.Settings settings;
/**
* Constructor.
*
- * @param partitionDistanceFunction Distance function
- * @param partitionAlgorithm Algorithm to use on partitions
- * @param partitionAlgorithmParameters Parameters for Algorithm to run on
- * partitions
+ * @param settings COPAC parameters
*/
- public COPAC(FilteredLocalPCABasedDistanceFunction<V, ?, D> partitionDistanceFunction, Class<? extends ClusteringAlgorithm<Clustering<Model>>> partitionAlgorithm, Collection<Pair<OptionID, Object>> partitionAlgorithmParameters) {
+ public COPAC(COPAC.Settings settings) {
super();
- this.partitionDistanceFunction = partitionDistanceFunction;
- this.partitionAlgorithm = partitionAlgorithm;
- this.partitionAlgorithmParameters = partitionAlgorithmParameters;
- }
-
- /**
- * Performs the COPAC algorithm on the given database.
- *
- * @param relation Relation to process
- * @return Clustering result
- */
- @SuppressWarnings("unchecked")
- public Clustering<Model> run(Relation<V> relation) {
- if(LOG.isVerbose()) {
- LOG.verbose("Running COPAC on db size = " + relation.size() + " with dimensionality = " + RelationUtil.dimensionality(relation));
- }
-
- partitionDistanceQuery = (FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D>) partitionDistanceFunction.instantiate(relation);
- LocalProjectionIndex<V, ?> preprocin = partitionDistanceQuery.getIndex();
-
- // partitioning
- Map<Integer, ModifiableDBIDs> partitionMap = new HashMap<>();
- FiniteProgress partitionProgress = LOG.isVerbose() ? new FiniteProgress("Partitioning", relation.size(), LOG) : null;
- int processed = 1;
-
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- int corrdim = preprocin.getLocalProjection(iditer).getCorrelationDimension();
-
- if(!partitionMap.containsKey(corrdim)) {
- partitionMap.put(corrdim, DBIDUtil.newArray());
- }
-
- partitionMap.get(corrdim).add(iditer);
- if(partitionProgress != null) {
- partitionProgress.setProcessed(processed++, LOG);
- }
- }
-
- if(partitionProgress != null) {
- partitionProgress.ensureCompleted(LOG);
- }
- if(LOG.isVerbose()) {
- for(Integer corrDim : partitionMap.keySet()) {
- ModifiableDBIDs list = partitionMap.get(corrDim);
- LOG.verbose("Partition [corrDim = " + corrDim + "]: " + list.size() + " objects.");
- }
- }
-
- // convert for partition algorithm.
- // TODO: do this with DynamicDBIDs instead
- Map<Integer, DBIDs> pmap = new HashMap<>();
- for(Entry<Integer, ModifiableDBIDs> ent : partitionMap.entrySet()) {
- pmap.put(ent.getKey(), ent.getValue());
- }
- // running partition algorithm
- return runPartitionAlgorithm(relation, pmap, partitionDistanceQuery);
+ this.settings = settings;
}
/**
- * Runs the partition algorithm and creates the result.
+ * Run the COPAC algorithm.
*
- * @param relation the database to run this algorithm on
- * @param partitionMap the map of partition IDs to object ids
- * @param query The preprocessor based query function
+ * @param database Database
+ * @param relation Vector field relation
+ * @return COPAC clustering
*/
- private Clustering<Model> runPartitionAlgorithm(Relation<V> relation, Map<Integer, DBIDs> partitionMap, DistanceQuery<V, D> query) {
- Clustering<Model> result = new Clustering<>("COPAC clustering", "copac-clustering");
-
- // TODO: use an extra finite progress for the partitions?
- for(Entry<Integer, DBIDs> pair : partitionMap.entrySet()) {
- // noise partition
- if(pair.getKey() == RelationUtil.dimensionality(relation)) {
- // Make a Noise cluster
- result.addToplevelCluster(new Cluster<Model>(pair.getValue(), true, ClusterModel.CLUSTER));
- }
- else {
- DBIDs partids = pair.getValue();
- ProxyDatabase proxy = new ProxyDatabase(partids, relation);
-
- ClusteringAlgorithm<Clustering<Model>> partitionAlgorithm = getPartitionAlgorithm(query);
- if(LOG.isVerbose()) {
- LOG.verbose("Running " + partitionAlgorithm.getClass().getName() + " on partition [corrDim = " + pair.getKey() + "]...");
- }
- Clustering<Model> p = partitionAlgorithm.run(proxy);
- // Re-Wrap resulting Clusters as DimensionModel clusters.
- for(Cluster<Model> clus : p.getAllClusters()) {
- if(clus.isNoise()) {
- result.addToplevelCluster(new Cluster<Model>(clus.getIDs(), true, ClusterModel.CLUSTER));
- }
- else {
- result.addToplevelCluster(new Cluster<Model>(clus.getIDs(), new DimensionModel(pair.getKey())));
- }
- }
+ public Clustering<DimensionModel> run(Database database, Relation<V> relation) {
+ COPACNeighborPredicate.Instance npred = new COPACNeighborPredicate<V>(settings).instantiate(database, relation);
+ CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database, TypeUtil.DBIDS);
+ Clustering<Model> dclusters = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
+ // Re-wrap the detected clusters for COPAC:
+ Clustering<DimensionModel> result = new Clustering<>("COPAC clustering", "copac-clustering");
+ // Generalized DBSCAN clusterings will be flat.
+ for(Hierarchy.Iter<Cluster<Model>> iter = dclusters.iterToplevelClusters(); iter.valid(); iter.advance()) {
+ Cluster<Model> clus = iter.get();
+ if(clus.size() > 0) {
+ int dim = npred.dimensionality(clus.getIDs().iter());
+ DimensionModel model = new DimensionModel(dim);
+ result.addToplevelCluster(new Cluster<>(clus.getIDs(), model));
}
}
return result;
}
- /**
- * Returns the partition algorithm.
- *
- * @return the specified partition algorithm
- */
- public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) {
- ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters);
- ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query);
- reconfig.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist);
- ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm);
- reconfig.failOnErrors();
- return instance;
- }
-
- /**
- * Get the last used distance query (to expose access to the preprocessor)
- *
- * Used by ERiC. TODO: migrate to factory pattern!
- *
- * @return distance query
- */
- public FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D> getPartitionDistanceQuery() {
- return partitionDistanceQuery;
- }
-
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
@@ -298,57 +137,128 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs
}
/**
- * Parameterization class.
+ * Class to wrap the COPAC settings.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractParameterizer {
- protected LocalProjectionIndex.Factory<V, ?> indexI = null;
-
- protected FilteredLocalPCABasedDistanceFunction<V, ?, D> pdistI = null;
-
- protected Class<? extends ClusteringAlgorithm<Clustering<Model>>> algC = null;
+ public static class Settings {
+ /**
+ * Neighborhood size.
+ */
+ public int k;
+
+ /**
+ * Class to compute PCA.
+ */
+ public PCAFilteredRunner pca;
+
+ /**
+ * Epsilon value for GDBSCAN.
+ */
+ public double epsilon;
+
+ /**
+ * MinPts parameter.
+ */
+ public int minpts;
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Size for the kNN neighborhood used in the PCA step of COPAC.
+ */
+ public static final OptionID K_ID = new OptionID("copac.knn", "Number of neighbors to use for PCA.");
+
+ /**
+ * Settings to build.
+ */
+ Settings settings;
+
+ @Override
+ public void makeOptions(Parameterization config) {
+ settings = new Settings();
+ configK(config);
+ // TODO: allow using other PCA runners?
+ settings.pca = config.tryInstantiate(PCAFilteredRunner.class);
+ configEpsilon(config);
+ configMinPts(config);
+ }
- protected Collection<Pair<OptionID, Object>> algO = null;
+ /**
+ * Configure the kNN parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configK(Parameterization config) {
+ IntParameter kP = new IntParameter(K_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ settings.k = kP.intValue();
+ }
+ }
- @Override
- protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
- ClassParameter<Factory<V, ?>> indexP = new ClassParameter<>(PREPROCESSOR_ID, LocalProjectionIndex.Factory.class);
- if(config.grab(indexP)) {
- indexI = indexP.instantiateClass(config);
+ /**
+ * Configure the epsilon radius parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configEpsilon(Parameterization config) {
+ DoubleParameter epsilonP = new DoubleParameter(DBSCAN.Parameterizer.EPSILON_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(epsilonP)) {
+ settings.epsilon = epsilonP.doubleValue();
+ }
}
- ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>> pdistP = new ObjectParameter<>(PARTITION_DISTANCE_ID, FilteredLocalPCABasedDistanceFunction.class, LocallyWeightedDistanceFunction.class);
- if(config.grab(pdistP)) {
- ListParameterization predefinedDist = new ListParameterization();
- predefinedDist.addParameter(IndexBasedDistanceFunction.INDEX_ID, indexI);
- ChainedParameterization chainDist = new ChainedParameterization(predefinedDist, config);
- chainDist.errorsTo(config);
- pdistI = pdistP.instantiateClass(chainDist);
- predefinedDist.reportInternalParameterizationErrors(config);
+ /**
+ * Configure the minPts aka "mu" parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configMinPts(Parameterization config) {
+ IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minptsP)) {
+ settings.minpts = minptsP.intValue();
+ }
}
- // Parameterize algorithm:
- ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class);
- if(config.grab(algP)) {
- ListParameterization predefined = new ListParameterization();
- predefined.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI);
- TrackParameters trackpar = new TrackParameters(config);
- ChainedParameterization chain = new ChainedParameterization(predefined, trackpar);
- chain.errorsTo(config);
- algP.instantiateClass(chain);
- algC = algP.getValue();
- algO = trackpar.getGivenParameters();
- predefined.reportInternalParameterizationErrors(chain);
+ @Override
+ public Settings makeInstance() {
+ return settings;
}
}
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * COPAC settings.
+ */
+ protected COPAC.Settings settings;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ settings = config.tryInstantiate(COPAC.Settings.class);
+ }
@Override
- protected COPAC<V, D> makeInstance() {
- return new COPAC<>(pdistI, algC, algO);
+ protected COPAC<V> makeInstance() {
+ return new COPAC<>(settings);
}
}
} \ No newline at end of file