package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2011 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; import java.util.List; import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.AbstractDimensionsSelectingDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionsSelectingEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** *

* Implementation of the SUBCLU algorithm, an algorithm to detect arbitrarily * shaped and positioned clusters in subspaces. SUBCLU delivers for each * subspace the same clusters DBSCAN would have found, when applied to this * subspace separately. *

* Reference:
* K. Kailing, H.-P. Kriegel, P. Kroeger: Density connected Subspace Clustering * for High Dimensional Data.
* In Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004. *

* * @author Elke Achtert * * @apiviz.uses DBSCAN * @apiviz.uses DimensionsSelectingEuclideanDistanceFunction * @apiviz.has SubspaceModel * * @param the type of FeatureVector handled by this Algorithm */ @Title("SUBCLU: Density connected Subspace Clustering") @Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.") @Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004") public class SUBCLU> extends AbstractAlgorithm>> implements ClusteringAlgorithm>> { /** * The logger for this class. */ private static final Logging logger = Logging.getLogger(SUBCLU.class); /** * The distance function to determine the distance between database objects. *

* Default value: {@link DimensionsSelectingEuclideanDistanceFunction} *

* Key: {@code -subclu.distancefunction} *

*/ public static final OptionID DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("subclu.distancefunction", "Distance function to determine the distance between database objects."); /** * Parameter to specify the maximum radius of the neighborhood to be * considered, must be suitable to * {@link AbstractDimensionsSelectingDoubleDistanceFunction}. *

* Key: {@code -subclu.epsilon} *

*/ public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("subclu.epsilon", "The maximum radius of the neighborhood to be considered."); /** * Parameter to specify the threshold for minimum number of points in the * epsilon-neighborhood of a point, must be an integer greater than 0. *

* Key: {@code -subclu.minpts} *

*/ public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("subclu.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); /** * Holds the instance of the distance function specified by * {@link #DISTANCE_FUNCTION_ID}. */ private AbstractDimensionsSelectingDoubleDistanceFunction distanceFunction; /** * Holds the value of {@link #EPSILON_ID}. */ private DoubleDistance epsilon; /** * Holds the value of {@link #MINPTS_ID}. */ private int minpts; /** * Holds the result; */ private Clustering> result; /** * Constructor. * * @param distanceFunction Distance function * @param epsilon Epsilon value * @param minpts Minpts value */ public SUBCLU(AbstractDimensionsSelectingDoubleDistanceFunction distanceFunction, DoubleDistance epsilon, int minpts) { super(); this.distanceFunction = distanceFunction; this.epsilon = epsilon; this.minpts = minpts; } /** * Performs the SUBCLU algorithm on the given database. * * @param relation Relation to process * @return Clustering result */ public Clustering> run(Relation relation) throws IllegalStateException { final int dimensionality = DatabaseUtil.dimensionality(relation); StepProgress stepprog = logger.isVerbose() ? new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters if(stepprog != null) { stepprog.beginStep(1, "Generate all 1-dimensional clusters.", logger); } // mapping of dimensionality to set of subspaces HashMap>> subspaceMap = new HashMap>>(); // list of 1-dimensional subspaces containing clusters List> s_1 = new ArrayList>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters TreeMap, List>> clusterMap = new TreeMap, List>>(new Subspace.DimensionComparator()); for(int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List> clusters = runDBSCAN(relation, null, currentSubspace); if(logger.isDebuggingFiner()) { StringBuffer msg = new StringBuffer(); msg.append("\n").append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n"); for(Cluster cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } logger.debugFiner(msg.toString()); } if(!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters for(int d = 0; d < dimensionality - 1; d++) { if(stepprog != null) { stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", logger); } List> subspaces = subspaceMap.get(d); if(subspaces == null || subspaces.isEmpty()) { if(stepprog != null) { for(int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep(dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", logger); } } break; } List> candidates = generateSubspaceCandidates(subspaces); List> s_d = new ArrayList>(); for(Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); if(logger.isDebuggingFine()) { logger.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List> bestSubspaceClusters = clusterMap.get(bestSubspace); List> clusters = new ArrayList>(); for(Cluster cluster : bestSubspaceClusters) { List> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); if(!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } if(logger.isDebuggingFine()) { StringBuffer msg = new StringBuffer(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); for(Cluster c : clusters) { msg.append(" " + c.getIDs() + "\n"); } logger.debugFine(msg.toString()); } if(!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } if(!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } // build result int numClusters = 1; result = new Clustering>("SUBCLU clustering", "subclu-clustering"); for(Subspace subspace : clusterMap.descendingKeySet()) { List> clusters = clusterMap.get(subspace); for(Cluster cluster : clusters) { Cluster> newCluster = new Cluster>(cluster.getIDs()); newCluster.setModel(new SubspaceModel(subspace, DatabaseUtil.centroid(relation, cluster.getIDs()))); newCluster.setName("cluster_" + numClusters++); result.addCluster(newCluster); } } if(stepprog != null) { stepprog.setCompleted(logger); } return result; } /** * Returns the result of the algorithm. * * @return the result of the algorithm */ public Clustering> getResult() { return result; } /** * Runs the DBSCAN algorithm on the specified partition of the database in the * given subspace. If parameter {@code ids} is null DBSCAN will be applied to * the whole database. * * @param relation the database holding the objects to run DBSCAN on * @param ids the IDs of the database defining the partition to run DBSCAN on * - if this parameter is null DBSCAN will be applied to the whole * database * @param subspace the subspace to run DBSCAN on * @return the clustering result of the DBSCAN run */ private List> runDBSCAN(Relation relation, DBIDs ids, Subspace subspace) { // distance function distanceFunction.setSelectedDimensions(subspace.getDimensions()); ProxyDatabase proxy; if(ids == null) { // TODO: in this case, we might want to use an index - the proxy below // will prevent this! ids = relation.getDBIDs(); } proxy = new ProxyDatabase(ids, relation); DBSCAN dbscan = new DBSCAN(distanceFunction, epsilon, minpts); // run DBSCAN if(logger.isVerbose()) { logger.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); } Clustering dbsres = dbscan.run(proxy); // separate cluster and noise List> clusterAndNoise = dbsres.getAllClusters(); List> clusters = new ArrayList>(); for(Cluster c : clusterAndNoise) { if(!c.isNoise()) { clusters.add(c); } } return clusters; } /** * Generates {@code d+1}-dimensional subspace candidates from the specified * {@code d}-dimensional subspaces. * * @param subspaces the {@code d}-dimensional subspaces * @return the {@code d+1}-dimensional subspace candidates */ private List> generateSubspaceCandidates(List> subspaces) { List> candidates = new ArrayList>(); if(subspaces.isEmpty()) { return candidates; } // Generate (d+1)-dimensional candidate subspaces int d = subspaces.get(0).dimensionality(); StringBuffer msgFine = new StringBuffer("\n"); if(logger.isDebuggingFiner()) { msgFine.append("subspaces ").append(subspaces).append("\n"); } for(int i = 0; i < subspaces.size(); i++) { Subspace s1 = subspaces.get(i); for(int j = i + 1; j < subspaces.size(); j++) { Subspace s2 = subspaces.get(j); Subspace candidate = s1.join(s2); if(candidate != null) { if(logger.isDebuggingFiner()) { msgFine.append("candidate: ").append(candidate.dimensonsToString()).append("\n"); } // prune irrelevant candidate subspaces List> lowerSubspaces = lowerSubspaces(candidate); if(logger.isDebuggingFiner()) { msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append("\n"); } boolean irrelevantCandidate = false; for(Subspace s : lowerSubspaces) { if(!subspaces.contains(s)) { irrelevantCandidate = true; break; } } if(!irrelevantCandidate) { candidates.add(candidate); } } } } if(logger.isDebuggingFiner()) { logger.debugFiner(msgFine.toString()); } if(logger.isDebugging()) { StringBuffer msg = new StringBuffer(); msg.append(d + 1).append("-dimensional candidate subspaces: "); for(Subspace candidate : candidates) { msg.append(candidate.dimensonsToString()).append(" "); } logger.debug(msg.toString()); } return candidates; } /** * Returns the list of all {@code (d-1)}-dimensional subspaces of the * specified {@code d}-dimensional subspace. * * @param subspace the {@code d}-dimensional subspace * @return a list of all {@code (d-1)}-dimensional subspaces */ private List> lowerSubspaces(Subspace subspace) { int dimensionality = subspace.dimensionality(); if(dimensionality <= 1) { return null; } // order result according to the dimensions List> result = new ArrayList>(); BitSet dimensions = subspace.getDimensions(); for(int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) { BitSet newDimensions = (BitSet) dimensions.clone(); newDimensions.set(dim, false); result.add(new Subspace(newDimensions)); } return result; } /** * Determines the {@code d}-dimensional subspace of the {@code (d+1)} * -dimensional candidate with minimal number of objects in the cluster. * * @param subspaces the list of {@code d}-dimensional subspaces containing * clusters * @param candidate the {@code (d+1)}-dimensional candidate subspace * @param clusterMap the mapping of subspaces to clusters * @return the {@code d}-dimensional subspace of the {@code (d+1)} * -dimensional candidate with minimal number of objects in the * cluster */ private Subspace bestSubspace(List> subspaces, Subspace candidate, TreeMap, List>> clusterMap) { Subspace bestSubspace = null; for(Subspace subspace : subspaces) { int min = Integer.MAX_VALUE; if(subspace.isSubspace(candidate)) { List> clusters = clusterMap.get(subspace); for(Cluster cluster : clusters) { int clusterSize = cluster.size(); if(clusterSize < min) { min = clusterSize; bestSubspace = subspace; } } } } return bestSubspace; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return logger; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer> extends AbstractParameterizer { protected int minpts = 0; protected DoubleDistance epsilon = null; protected AbstractDimensionsSelectingDoubleDistanceFunction distance = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter> param = new ObjectParameter>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, DimensionsSelectingEuclideanDistanceFunction.class); if(config.grab(param)) { distance = param.instantiateClass(config); } DistanceParameter epsilonP = new DistanceParameter(EPSILON_ID, distance); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0)); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } } @Override protected SUBCLU makeInstance() { return new SUBCLU(distance, epsilon, minpts); } } }