package de.lmu.ifi.dbs.elki.algorithm.clustering; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2012 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.ClusterModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides an abstract algorithm requiring a VarianceAnalysisPreprocessor. * * @author Arthur Zimek * @param the type of NumberVector handled by this Algorithm */ public abstract class AbstractProjectedDBSCAN, V extends NumberVector> extends AbstractAlgorithm implements ClusteringAlgorithm { /** * Parameter to specify the distance function to determine the distance * between database objects, must extend * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction} * . *

* Key: {@code -projdbscan.distancefunction} *

*

* Default value: * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction} *

*/ public static final OptionID OUTER_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("projdbscan.outerdistancefunction", "Distance function to determine the distance between database objects."); /** * Parameter distance function */ public static final OptionID INNER_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("projdbscan.distancefunction", "Distance function to determine the neighbors for variance analysis."); /** * Parameter to specify the maximum radius of the neighborhood to be * considered, must be suitable to {@link LocallyWeightedDistanceFunction}. *

* Key: {@code -projdbscan.epsilon} *

*/ public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("projdbscan.epsilon", "The maximum radius of the neighborhood to be considered."); /** * Parameter to specify the intrinsic dimensionality of the clusters to find, * must be an integer greater than 0. *

* Key: {@code -projdbscan.lambda} *

*/ public static final OptionID LAMBDA_ID = OptionID.getOrCreateOptionID("projdbscan.lambda", "The intrinsic dimensionality of the clusters to find."); /** * Parameter to specify the threshold for minimum number of points in the * epsilon-neighborhood of a point, must be an integer greater than 0. *

* Key: {@code -projdbscan.minpts} *

*/ public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("projdbscan.minpts", "Threshold for minimum number of points in " + "the epsilon-neighborhood of a point."); /** * Holds the instance of the distance function specified by * {@link #INNER_DISTANCE_FUNCTION_ID}. */ private LocallyWeightedDistanceFunction distanceFunction; /** * Holds the value of {@link #EPSILON_ID}. */ protected DoubleDistance epsilon; /** * Holds the value of {@link #LAMBDA_ID}. */ private int lambda; /** * Holds the value of {@link #MINPTS_ID}. */ protected int minpts = 1; /** * Holds a list of clusters found. */ private List resultList; /** * Holds a set of noise. */ private ModifiableDBIDs noise; /** * Holds a set of processed ids. */ private ModifiableDBIDs processedIDs; /** * Constructor. * * @param epsilon Epsilon * @param minpts MinPts parameter * @param distanceFunction Outer distance function * @param lambda Lambda value */ public AbstractProjectedDBSCAN(DoubleDistance epsilon, int minpts, LocallyWeightedDistanceFunction distanceFunction, int lambda) { super(); this.epsilon = epsilon; this.minpts = minpts; this.distanceFunction = distanceFunction; this.lambda = lambda; } public Clustering run(Database database, Relation relation) throws IllegalStateException { FiniteProgress objprog = getLogger().isVerbose() ? new FiniteProgress("Processing objects", relation.size(), getLogger()) : null; IndefiniteProgress clusprog = getLogger().isVerbose() ? new IndefiniteProgress("Number of clusters", getLogger()) : null; resultList = new ArrayList(); noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(relation.size()); LocallyWeightedDistanceFunction.Instance distFunc = distanceFunction.instantiate(relation); RangeQuery rangeQuery = database.getRangeQuery(distFunc); if(relation.size() >= minpts) { for(DBID id : relation.iterDBIDs()) { if(!processedIDs.contains(id)) { expandCluster(distFunc, rangeQuery, id, objprog, clusprog); if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } } } else { for(DBID id : relation.iterDBIDs()) { noise.add(id); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } } } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } Clustering result = new Clustering(getLongResultName(), getShortResultName()); for(Iterator resultListIter = resultList.iterator(); resultListIter.hasNext();) { Cluster c = new Cluster(resultListIter.next(), ClusterModel.CLUSTER); result.addCluster(c); } Cluster n = new Cluster(noise, true, ClusterModel.CLUSTER); result.addCluster(n); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } // Signal that the progress has completed. if(objprog != null && clusprog != null) { objprog.ensureCompleted(getLogger()); clusprog.setCompleted(getLogger()); } return result; } /** * Return the long result name. * * @return Long name for result */ public abstract String getLongResultName(); /** * Return the short result name. * * @return Short name for result */ public abstract String getShortResultName(); /** * ExpandCluster function of DBSCAN. * * @param distFunc Distance query to use * @param rangeQuery Range query * @param startObjectID the object id of the database object to start the * expansion with * @param objprog the progress object for logging the current status */ protected void expandCluster(LocallyWeightedDistanceFunction.Instance distFunc, RangeQuery rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { Integer corrDim = distFunc.getIndex().getLocalProjection(startObjectID).getCorrelationDimension(); if(getLogger().isDebugging()) { getLogger().debugFine("EXPAND CLUSTER id = " + startObjectID + " " + corrDim + "\n#clusters: " + resultList.size()); } // euclidean epsilon neighborhood < minpts OR local dimensionality > // lambda -> noise if(corrDim == null || corrDim > lambda) { noise.add(startObjectID); processedIDs.add(startObjectID); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } return; } // compute weighted epsilon neighborhood List> seeds = rangeQuery.getRangeForDBID(startObjectID, epsilon); // neighbors < minPts -> noise if(seeds.size() < minpts) { noise.add(startObjectID); processedIDs.add(startObjectID); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } return; } // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DistanceResultPair seed : seeds) { DBID nextID = seed.getDBID(); Integer nextID_corrDim = distFunc.getIndex().getLocalProjection(nextID).getCorrelationDimension(); // nextID is not reachable from start object if(nextID_corrDim > lambda) { continue; } if(!processedIDs.contains(nextID)) { currentCluster.add(nextID); processedIDs.add(nextID); } else if(noise.contains(nextID)) { currentCluster.add(nextID); noise.remove(nextID); } } seeds.remove(0); while(seeds.size() > 0) { DBID q = seeds.remove(0).getDBID(); Integer corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension(); // q forms no lambda-dim hyperplane if(corrDim_q > lambda) { continue; } List> reachables = rangeQuery.getRangeForDBID(q, epsilon); if(reachables.size() > minpts) { for(DistanceResultPair r : reachables) { Integer corrDim_r = distFunc.getIndex().getLocalProjection(r.getDBID()).getCorrelationDimension(); // r is not reachable from q if(corrDim_r > lambda) { continue; } boolean inNoise = noise.contains(r.getDBID()); boolean unclassified = !processedIDs.contains(r.getDBID()); if(inNoise || unclassified) { if(unclassified) { seeds.add(r); } currentCluster.add(r.getDBID()); processedIDs.add(r.getDBID()); if(inNoise) { noise.remove(r.getDBID()); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size(); clusprog.setProcessed(numClusters, getLogger()); } } } } /* if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } */ } if(currentCluster.size() >= minpts) { resultList.add(currentCluster); } else { for(DBID id : currentCluster) { noise.add(id); } noise.add(startObjectID); processedIDs.add(startObjectID); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); } } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(distanceFunction.getInputTypeRestriction()); } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static abstract class Parameterizer, D extends Distance> extends AbstractParameterizer { protected DistanceFunction innerdist; protected D epsilon; protected LocallyWeightedDistanceFunction outerdist; protected int minpts = -1; protected Integer lambda; protected void configInnerDistance(Parameterization config) { ObjectParameter> innerdistP = new ObjectParameter>(AbstractProjectedDBSCAN.INNER_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); if(config.grab(innerdistP)) { innerdist = innerdistP.instantiateClass(config); } } protected void configEpsilon(Parameterization config, DistanceFunction innerdist) { D distanceParser = innerdist != null ? innerdist.getDistanceFactory() : null; DistanceParameter epsilonP = new DistanceParameter(EPSILON_ID, distanceParser); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } } protected void configMinPts(Parameterization config) { IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0)); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } } protected void configOuterDistance(Parameterization config, D epsilon, int minpts, Class preprocessorClass, DistanceFunction innerdist) { ObjectParameter> outerdistP = new ObjectParameter>(OUTER_DISTANCE_FUNCTION_ID, LocallyWeightedDistanceFunction.class, LocallyWeightedDistanceFunction.class); if(config.grab(outerdistP)) { // parameters for the distance function ListParameterization distanceFunctionParameters = new ListParameterization(); // distanceFunctionParameters.addFlag(PreprocessorHandler.OMIT_PREPROCESSING_ID); distanceFunctionParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, preprocessorClass); distanceFunctionParameters.addParameter(AbstractProjectedDBSCAN.INNER_DISTANCE_FUNCTION_ID, innerdist); distanceFunctionParameters.addParameter(AbstractProjectedDBSCAN.EPSILON_ID, epsilon); distanceFunctionParameters.addParameter(AbstractProjectedDBSCAN.MINPTS_ID, minpts); ChainedParameterization combinedConfig = new ChainedParameterization(distanceFunctionParameters, config); combinedConfig.errorsTo(config); outerdist = outerdistP.instantiateClass(combinedConfig); } } protected void configLambda(Parameterization config) { IntParameter lambdaP = new IntParameter(LAMBDA_ID, new GreaterConstraint(0)); if(config.grab(lambdaP)) { lambda = lambdaP.getValue(); } } } }