package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.COPACNeighborPredicate; import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate; import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN; import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.DimensionModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * COPAC is an algorithm to partition a database according to the correlation * dimension of its objects and to then perform an arbitrary clustering * algorithm over the partitions. * * Reference: *

* E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek:
* Robust, Complete, and Efficient Correlation Clustering.
* In Proc. 7th SIAM International Conference on Data Mining (SDM'07), * Minneapolis, MN, 2007 *

* * @author Arthur Zimek * * @apiviz.has DimensionModel * @apiviz.composedOf COPACNeighborPredicate * * @param the type of NumberVector handled by this Algorithm */ @Title("COPAC: COrrelation PArtition Clustering") @Description("Partitions a database according to the correlation dimension of its objects and performs " // + "a clustering algorithm over the partitions.") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek", // title = "Robust, Complete, and Efficient Correlation Clustering", // booktitle = "Proc. 7th SIAM International Conference on Data Mining (SDM'07), Minneapolis, MN, 2007", // url = "http://www.siam.org/proceedings/datamining/2007/dm07_037achtert.pdf") public class COPAC extends AbstractAlgorithm> implements ClusteringAlgorithm> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(COPAC.class); /** * Settings class. */ COPAC.Settings settings; /** * Constructor. * * @param settings COPAC parameters */ public COPAC(COPAC.Settings settings) { super(); this.settings = settings; } /** * Run the COPAC algorithm. * * @param database Database * @param relation Vector field relation * @return COPAC clustering */ public Clustering run(Database database, Relation relation) { COPACNeighborPredicate.Instance npred = new COPACNeighborPredicate(settings).instantiate(database, relation); CorePredicate.Instance cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database, TypeUtil.DBIDS); Clustering dclusters = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run(); // Re-wrap the detected clusters for COPAC: Clustering result = new Clustering<>("COPAC clustering", "copac-clustering"); // Generalized DBSCAN clusterings will be flat. for(Hierarchy.Iter> iter = dclusters.iterToplevelClusters(); iter.valid(); iter.advance()) { Cluster clus = iter.get(); if(clus.size() > 0) { int dim = npred.dimensionality(clus.getIDs().iter()); DimensionModel model = new DimensionModel(dim); result.addToplevelCluster(new Cluster<>(clus.getIDs(), model)); } } return result; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return LOG; } /** * Class to wrap the COPAC settings. * * @author Erich Schubert * * @apiviz.exclude */ public static class Settings { /** * Neighborhood size. */ public int k; /** * Class to compute PCA. */ public PCAFilteredRunner pca; /** * Epsilon value for GDBSCAN. */ public double epsilon; /** * MinPts parameter. */ public int minpts; /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { /** * Size for the kNN neighborhood used in the PCA step of COPAC. */ public static final OptionID K_ID = new OptionID("copac.knn", "Number of neighbors to use for PCA."); /** * Settings to build. */ Settings settings; @Override public void makeOptions(Parameterization config) { settings = new Settings(); configK(config); // TODO: allow using other PCA runners? settings.pca = config.tryInstantiate(PCAFilteredRunner.class); configEpsilon(config); configMinPts(config); } /** * Configure the kNN parameter. * * @param config Parameter source */ protected void configK(Parameterization config) { IntParameter kP = new IntParameter(K_ID) // .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(kP)) { settings.k = kP.intValue(); } } /** * Configure the epsilon radius parameter. * * @param config Parameter source */ protected void configEpsilon(Parameterization config) { DoubleParameter epsilonP = new DoubleParameter(DBSCAN.Parameterizer.EPSILON_ID) // .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); if(config.grab(epsilonP)) { settings.epsilon = epsilonP.doubleValue(); } } /** * Configure the minPts aka "mu" parameter. * * @param config Parameter source */ protected void configMinPts(Parameterization config) { IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) // .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { settings.minpts = minptsP.intValue(); } } @Override public Settings makeInstance() { return settings; } } } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { /** * COPAC settings. */ protected COPAC.Settings settings; @Override protected void makeOptions(Parameterization config) { settings = config.tryInstantiate(COPAC.Settings.class); } @Override protected COPAC makeInstance() { return new COPAC<>(settings); } } }