diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java | 681 |
1 files changed, 424 insertions, 257 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index cd5e51b8..20849dd6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,8 +23,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.map.hash.TCustomHashMap; +import gnu.trove.procedure.TObjectObjectProcedure; + import java.util.ArrayList; -import java.util.BitSet; import java.util.Collection; import java.util.Collections; import java.util.Comparator; @@ -34,7 +36,9 @@ import java.util.List; import java.util.Map; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS; +import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.ClusterOrderResult; +import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.CorrelationClusterOrderEntry; +import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.GeneralizedOPTICS; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -42,26 +46,20 @@ import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DiSHDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.AbstractDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelationDistance; import de.lmu.ifi.dbs.elki.index.preprocessed.preference.DiSHPreferenceVectorIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid; -import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; -import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; -import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -73,7 +71,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -93,7 +90,6 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * @author Elke Achtert * * @apiviz.uses DiSHPreferenceVectorIndex - * @apiviz.uses DiSHDistanceFunction * @apiviz.has SubspaceModel * * @param <V> the type of NumberVector handled by this Algorithm @@ -101,94 +97,61 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("DiSH: Detecting Subspace cluster Hierarchies") @Description("Algorithm to find hierarchical correlation clusters in subspaces.") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15") -public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class DiSH<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(DiSH.class); /** - * Parameter that specifies the maximum radius of the neighborhood to be - * considered in each dimension for determination of the preference vector, - * must be a double equal to or greater than 0. - * <p> - * Default value: {@code 0.001} - * </p> - * <p> - * Key: {@code -dish.epsilon} - * </p> - */ - public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", "The maximum radius of the neighborhood " + "to be considered in each dimension for determination of " + "the preference vector."); - - /** - * Parameter that specifies the a minimum number of points as a smoothing - * factor to avoid the single-link-effect, must be an integer greater than 0. - * <p> - * Default value: {@code 1} - * </p> - * <p> - * Key: {@code -dish.mu} - * </p> - */ - public static final OptionID MU_ID = new OptionID("dish.mu", "The minimum number of points as a smoothing factor to avoid the single-link-effekt."); - - /** - * Holds the value of {@link #EPSILON_ID}. + * Holds the value of {@link Parameterizer#EPSILON_ID}. */ private double epsilon; /** - * The distance function we use + * The DiSH preprocessor. */ - private DiSHDistanceFunction dishDistance; + private DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor; /** - * Parameters that were given to OPTICS + * OPTICS minPts parameter. */ - private Collection<Pair<OptionID, Object>> opticsAlgorithmParameters; + private int mu; /** * Constructor. * * @param epsilon Epsilon value - * @param dishDistance Distance function - * @param opticsAlgorithmParameters OPTICS parameters + * @param mu Mu parameter (minPts) + * @param dishPreprocessor DiSH preprocessor */ - public DiSH(double epsilon, DiSHDistanceFunction dishDistance, Collection<Pair<OptionID, Object>> opticsAlgorithmParameters) { + public DiSH(double epsilon, int mu, DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor) { super(); this.epsilon = epsilon; - this.dishDistance = dishDistance; - this.opticsAlgorithmParameters = opticsAlgorithmParameters; + this.mu = mu; + this.dishPreprocessor = dishPreprocessor; } /** * Performs the DiSH algorithm on the given database. * - * @param database Database to process * @param relation Relation to process */ - public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { - // Instantiate DiSH distance (and thus run the preprocessor) + public Clustering<SubspaceModel> run(Relation<V> relation) { if(LOG.isVerbose()) { - LOG.verbose("*** Run DiSH preprocessor."); + LOG.verbose("Running the DiSH preprocessor."); } - DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation); - // Configure and run OPTICS. + DiSHPreferenceVectorIndex<V> indexinst = dishPreprocessor.instantiate(relation); if(LOG.isVerbose()) { - LOG.verbose("*** Run OPTICS algorithm."); + LOG.verbose("Running the OPTICS algorithm."); } - ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters); - opticsconfig.addParameter(OPTICS.DISTANCE_FUNCTION_ID, ProxyDistanceFunction.proxy(dishDistanceQuery)); - Class<OPTICS<V, PreferenceVectorBasedCorrelationDistance>> cls = ClassGenericsUtil.uglyCastIntoSubclass(OPTICS.class); - OPTICS<V, PreferenceVectorBasedCorrelationDistance> optics = null; - optics = opticsconfig.tryInstantiate(cls); - ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation); + ClusterOrderResult<DiSHClusterOrderEntry> opticsResult = new DiSHOPTICS(indexinst).run(relation); if(LOG.isVerbose()) { - LOG.verbose("*** Compute Clusters."); + LOG.verbose("Compute Clusters."); } - return computeClusters(relation, opticsResult, dishDistanceQuery); + return computeClusters(relation, opticsResult); } /** @@ -196,58 +159,68 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * * @param database the database holding the objects * @param clusterOrder the cluster order - * @param distFunc Distance function */ - private Clustering<SubspaceModel<V>> computeClusters(Relation<V> database, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder, DiSHDistanceFunction.Instance<V> distFunc) { - int dimensionality = RelationUtil.dimensionality(database); - int minpts = dishDistance.getMinpts(); + private Clustering<SubspaceModel> computeClusters(Relation<V> database, ClusterOrderResult<DiSHClusterOrderEntry> clusterOrder) { + final int dimensionality = RelationUtil.dimensionality(database); // extract clusters - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder); + TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = extractClusters(database, clusterOrder); if(LOG.isVerbose()) { - StringBuilder msg = new StringBuilder("Step 1: extract clusters"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); + final StringBuilder msg = new StringBuilder("Step 1: extract clusters\n"); + clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() { + @Override + public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) { + msg.append(BitsUtil.toStringLow(key, dimensionality)).append(" sizes:"); + for(ArrayModifiableDBIDs c : clusters) { + msg.append(' ').append(c.size()); + } + msg.append('\n'); + return true; // continue } - } + }); LOG.verbose(msg.toString()); } // check if there are clusters < minpts - checkClusters(database, distFunc, clustersMap, minpts); + checkClusters(database, clustersMap); if(LOG.isVerbose()) { - StringBuilder msg = new StringBuilder("Step 2: check clusters"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); + final StringBuilder msg = new StringBuilder("Step 2: check clusters\n"); + clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() { + @Override + public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) { + msg.append(BitsUtil.toStringLow(key, dimensionality)).append(" sizes:"); + for(ArrayModifiableDBIDs c : clusters) { + msg.append(' ').append(c.size()); + } + msg.append('\n'); + return true; // continue } - } + }); LOG.verbose(msg.toString()); } // sort the clusters - List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap); + List<Cluster<SubspaceModel>> clusters = sortClusters(database, clustersMap); if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 3: sort clusters"); - for(Cluster<SubspaceModel<V>> c : clusters) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size()); + for(Cluster<SubspaceModel> c : clusters) { + msg.append('\n').append(BitsUtil.toStringLow(c.getModel().getSubspace().getDimensions(), dimensionality)).append(" ids ").append(c.size()); } LOG.verbose(msg.toString()); } // build the hierarchy - Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); - buildHierarchy(database, distFunc, clustering, clusters, dimensionality); + Clustering<SubspaceModel> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); + buildHierarchy(database, clustering, clusters, dimensionality); if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 4: build hierarchy"); - for(Cluster<SubspaceModel<V>> c : clusters) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); - for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { + for(Cluster<SubspaceModel> c : clusters) { + msg.append('\n').append(BitsUtil.toStringLow(c.getModel().getSubspace().getDimensions(), dimensionality)).append(" ids ").append(c.size()); + for(Iter<Cluster<SubspaceModel>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { msg.append("\n parent ").append(iter.get()); } - for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { + for(Iter<Cluster<SubspaceModel>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { msg.append("\n child ").append(iter.get()); } } @@ -255,7 +228,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } // build result - for(Cluster<SubspaceModel<V>> c : clusters) { + for(Cluster<SubspaceModel> c : clusters) { if(clustering.getClusterHierarchy().numParents(c) == 0) { clustering.addToplevelCluster(c); } @@ -267,37 +240,38 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Extracts the clusters from the cluster order. * * @param database the database storing the objects - * @param distFunc the distance function * @param clusterOrder the cluster order to extract the clusters from * @return the extracted clusters */ - private Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> extractClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder) { + private TCustomHashMap<long[], List<ArrayModifiableDBIDs>> extractClusters(Relation<V> database, ClusterOrderResult<DiSHClusterOrderEntry> clusterOrder) { FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extract Clusters", database.size(), LOG) : null; int processed = 0; - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>(); - Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>(); - Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); - for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { - ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next(); + TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY); + // Note clusterOrder currently contains DBID objects anyway. + Map<DBID, DiSHClusterOrderEntry> entryMap = new HashMap<>(); + Map<DBID, Pair<long[], ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); + for(Iterator<DiSHClusterOrderEntry> it = clusterOrder.iterator(); it.hasNext();) { + DiSHClusterOrderEntry entry = it.next(); entryMap.put(entry.getID(), entry); V object = database.get(entry.getID()); - BitSet preferenceVector = entry.getReachability().getCommonPreferenceVector(); + long[] preferenceVector = entry.getCommonPreferenceVector(); // get the list of (parallel) clusters for the preference vector - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector); + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(preferenceVector); if(parallelClusters == null) { parallelClusters = new ArrayList<>(); clustersMap.put(preferenceVector, parallelClusters); } // look for the proper cluster - Pair<BitSet, ArrayModifiableDBIDs> cluster = null; - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database); - PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector); - if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { - double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector()); + ArrayModifiableDBIDs cluster = null; + for(ArrayModifiableDBIDs c : parallelClusters) { + Vector c_centroid = ProjectedCentroid.make(preferenceVector, database, c); + long[] commonPreferenceVector = BitsUtil.andCMin(preferenceVector, preferenceVector); + int subspaceDim = subspaceDimensionality(object, c_centroid, preferenceVector, preferenceVector, commonPreferenceVector); + if(subspaceDim == entry.getCorrelationValue()) { + double d = weightedDistance(object, c_centroid, commonPreferenceVector); if(d <= 2 * epsilon) { cluster = c; break; @@ -305,57 +279,56 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } if(cluster == null) { - cluster = new Pair<>(preferenceVector, DBIDUtil.newArray()); + cluster = DBIDUtil.newArray(); parallelClusters.add(cluster); } - cluster.second.add(entry.getID()); - entryToClusterMap.put(entry.getID(), cluster); + cluster.add(entry.getID()); + entryToClusterMap.put(entry.getID(), new Pair<>(preferenceVector, cluster)); if(progress != null) { progress.setProcessed(++processed, LOG); } } - if(progress != null) { - progress.ensureCompleted(LOG); - } + LOG.ensureCompleted(progress); if(LOG.isDebuggingFiner()) { + int dim = RelationUtil.dimensionality(database); StringBuilder msg = new StringBuilder("Step 0"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size()); + for(Map.Entry<long[], List<ArrayModifiableDBIDs>> clusterList : clustersMap.entrySet()) { + for(ArrayModifiableDBIDs c : clusterList.getValue()) { + msg.append('\n').append(BitsUtil.toStringLow(clusterList.getKey(), dim)).append(" ids ").append(c.size()); } } LOG.debugFiner(msg.toString()); } // add the predecessor to the cluster - for(BitSet pv : clustersMap.keySet()) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { - if(cluster.second.isEmpty()) { + for(long[] pv : clustersMap.keySet()) { + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); + for(ArrayModifiableDBIDs cluster : parallelClusters) { + if(cluster.isEmpty()) { continue; } - DBID firstID = cluster.second.get(0); - ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID); + DBID firstID = cluster.get(0); + DiSHClusterOrderEntry entry = entryMap.get(firstID); DBID predecessorID = entry.getPredecessorID(); if(predecessorID == null) { continue; } - ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID); + DiSHClusterOrderEntry predecessor = entryMap.get(predecessorID); // parallel cluster - if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { + if(BitsUtil.equal(predecessor.getCommonPreferenceVector(), entry.getCommonPreferenceVector())) { continue; } - if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) { + if(predecessor.compareTo(entry) < 0) { continue; } - Pair<BitSet, ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID); + Pair<long[], ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID); oldCluster.second.remove(predecessorID); - cluster.second.add(predecessorID); + cluster.add(predecessorID); entryToClusterMap.remove(predecessorID); - entryToClusterMap.put(predecessorID, cluster); + entryToClusterMap.put(predecessorID, new Pair<>(pv, cluster)); } } @@ -366,21 +339,21 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Returns a sorted list of the clusters w.r.t. the subspace dimensionality in * descending order. * - * @param database the database storing the objects + * @param relation the database storing the objects * @param clustersMap the mapping of bits sets to clusters * @return a sorted list of the clusters */ - private List<Cluster<SubspaceModel<V>>> sortClusters(Relation<V> database, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { - final int db_dim = RelationUtil.dimensionality(database); + private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) { + final int db_dim = RelationUtil.dimensionality(relation); // int num = 1; - List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>(); - for(BitSet pv : clustersMap.keySet()) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); + List<Cluster<SubspaceModel>> clusters = new ArrayList<>(); + for(long[] pv : clustersMap.keySet()) { + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); for(int i = 0; i < parallelClusters.size(); i++) { - Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i); - Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second); - cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); - String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, ""); + ArrayModifiableDBIDs c = parallelClusters.get(i); + Cluster<SubspaceModel> cluster = new Cluster<>(c); + cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c))); + String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim); if(parallelClusters.size() > 1) { cluster.setName("Cluster_" + subspace + "_" + i); } @@ -391,12 +364,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } // sort the clusters w.r.t. lambda - Comparator<Cluster<SubspaceModel<V>>> comparator = new Comparator<Cluster<SubspaceModel<V>>>() { + Comparator<Cluster<SubspaceModel>> comparator = new Comparator<Cluster<SubspaceModel>>() { @Override - public int compare(Cluster<SubspaceModel<V>> c1, Cluster<SubspaceModel<V>> c2) { + public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) { return c2.getModel().getSubspace().dimensionality() - c1.getModel().getSubspace().dimensionality(); } - }; Collections.sort(clusters, comparator); return clusters; @@ -406,32 +378,31 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Removes the clusters with size < minpts from the cluster map and adds them * to their parents. * - * @param database the database storing the objects - * @param distFunc the distance function + * @param relation the relation storing the objects * @param clustersMap the map containing the clusters - * @param minpts MinPts */ - private void checkClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap, int minpts) { + private void checkClusters(Relation<V> relation, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) { + final int dimensionality = RelationUtil.dimensionality(relation); // check if there are clusters < minpts // and add them to not assigned - List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>(); - Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray()); - for(BitSet pv : clustersMap.keySet()) { + List<Pair<long[], ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); + TCustomHashMap<long[], List<ArrayModifiableDBIDs>> newClustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY); + Pair<long[], ArrayModifiableDBIDs> noise = new Pair<>(BitsUtil.zero(dimensionality), DBIDUtil.newArray()); + for(long[] pv : clustersMap.keySet()) { // noise - if(pv.cardinality() == 0) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - noise.second.addDBIDs(c.second); + if(BitsUtil.cardinality(pv) == 0) { + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); + for(ArrayModifiableDBIDs c : parallelClusters) { + noise.second.addDBIDs(c); } } // clusters else { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size()); - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - if(!pv.equals(new BitSet()) && c.second.size() < minpts) { - notAssigned.add(c); + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); + List<ArrayModifiableDBIDs> newParallelClusters = new ArrayList<>(parallelClusters.size()); + for(ArrayModifiableDBIDs c : parallelClusters) { + if(!BitsUtil.isZero(pv) && c.size() < mu) { + notAssigned.add(new Pair<>(pv, c)); } else { newParallelClusters.add(c); @@ -444,11 +415,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin clustersMap.clear(); clustersMap.putAll(newClustersMap); - for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { + for(Pair<long[], ArrayModifiableDBIDs> c : notAssigned) { if(c.second.isEmpty()) { continue; } - Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap); + Pair<long[], ArrayModifiableDBIDs> parent = findParent(relation, c, clustersMap); if(parent != null) { parent.second.addDBIDs(c.second); } @@ -457,30 +428,29 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } - List<Pair<BitSet, ArrayModifiableDBIDs>> noiseList = new ArrayList<>(1); - noiseList.add(noise); + List<ArrayModifiableDBIDs> noiseList = new ArrayList<>(1); + noiseList.add(noise.second); clustersMap.put(noise.first, noiseList); } /** * Returns the parent of the specified cluster * - * @param database the database storing the objects - * @param distFunc the distance function + * @param relation the relation storing the objects * @param child the child to search the parent for * @param clustersMap the map containing the clusters * @return the parent of the specified cluster */ - private Pair<BitSet, ArrayModifiableDBIDs> findParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Pair<BitSet, ArrayModifiableDBIDs> child, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { - V child_centroid = ProjectedCentroid.make(child.first, database, child.second).toVector(database); + private Pair<long[], ArrayModifiableDBIDs> findParent(Relation<V> relation, Pair<long[], ArrayModifiableDBIDs> child, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) { + Vector child_centroid = ProjectedCentroid.make(child.first, relation, child.second); - Pair<BitSet, ArrayModifiableDBIDs> result = null; + Pair<long[], ArrayModifiableDBIDs> result = null; int resultCardinality = -1; - BitSet childPV = child.first; - int childCardinality = childPV.cardinality(); - for(BitSet parentPV : clustersMap.keySet()) { - int parentCardinality = parentPV.cardinality(); + long[] childPV = child.first; + int childCardinality = BitsUtil.cardinality(childPV); + for(long[] parentPV : clustersMap.keySet()) { + int parentCardinality = BitsUtil.cardinality(parentPV); if(parentCardinality >= childCardinality) { continue; } @@ -488,15 +458,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin continue; } - BitSet pv = (BitSet) childPV.clone(); - pv.and(parentPV); + long[] pv = BitsUtil.andCMin(childPV, parentPV); if(pv.equals(parentPV)) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV); - for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { - V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database); - double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV); + List<ArrayModifiableDBIDs> parentList = clustersMap.get(parentPV); + for(ArrayModifiableDBIDs parent : parentList) { + Vector parent_centroid = ProjectedCentroid.make(parentPV, relation, parent); + double d = weightedDistance(child_centroid, parent_centroid, parentPV); if(d <= 2 * epsilon) { - result = parent; + result = new Pair<>(parentPV, parent); resultCardinality = parentCardinality; break; } @@ -510,65 +479,70 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin /** * Builds the cluster hierarchy. * - * @param distFunc the distance function * @param clustering Clustering we process * @param clusters the sorted list of clusters * @param dimensionality the dimensionality of the data * @param database the database containing the data objects */ - private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Clustering<SubspaceModel<V>> clustering, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) { - StringBuilder msg = new StringBuilder(); + private void buildHierarchy(Relation<V> database, Clustering<SubspaceModel> clustering, List<Cluster<SubspaceModel>> clusters, int dimensionality) { + StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null; final int db_dim = RelationUtil.dimensionality(database); - Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy(); + Hierarchy<Cluster<SubspaceModel>> hier = clustering.getClusterHierarchy(); for(int i = 0; i < clusters.size() - 1; i++) { - Cluster<SubspaceModel<V>> c_i = clusters.get(i); - int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality(); - V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database); + Cluster<SubspaceModel> c_i = clusters.get(i); + final Subspace s_i = c_i.getModel().getSubspace(); + int subspaceDim_i = dimensionality - s_i.dimensionality(); + Vector ci_centroid = ProjectedCentroid.make(s_i.getDimensions(), database, c_i.getIDs()); + long[] pv1 = s_i.getDimensions(); for(int j = i + 1; j < clusters.size(); j++) { - Cluster<SubspaceModel<V>> c_j = clusters.get(j); - int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality(); + Cluster<SubspaceModel> c_j = clusters.get(j); + final Subspace s_j = c_j.getModel().getSubspace(); + int subspaceDim_j = dimensionality - s_j.dimensionality(); if(subspaceDim_i < subspaceDim_j) { - if(LOG.isDebugging()) { - msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']'); - msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']'); + if(msg != null) { + msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)).append(']'); + msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)).append(']'); } // noise level reached - if(c_j.getModel().getSubspace().dimensionality() == 0) { + if(s_j.dimensionality() == 0) { // no parents exists -> parent is noise if(hier.numParents(c_i) == 0) { clustering.addChildCluster(c_j, c_i); - if(LOG.isDebugging()) { - msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); - msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); + if(msg != null) { + msg.append("\n [").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)); + msg.append("] is parent of [").append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)); msg.append(']'); } } } else { - V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database); - PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions()); - double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector()); - if(LOG.isDebugging()) { - msg.append("\n dist = ").append(distance.getCorrelationValue()); + Vector cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()); + long[] pv2 = s_j.getDimensions(); + long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2); + int subspaceDim = subspaceDimensionality(ci_centroid, cj_centroid, pv1, pv2, commonPreferenceVector); + + double d = weightedDistance(ci_centroid, cj_centroid, commonPreferenceVector); + if(msg != null) { + msg.append("\n dist = ").append(subspaceDim); } - if(distance.getCorrelationValue() == subspaceDim_j) { - if(LOG.isDebugging()) { + if(subspaceDim == subspaceDim_j) { + if(msg != null) { msg.append("\n d = ").append(d); } if(d <= 2 * epsilon) { // no parent exists or c_j is not a parent of the already // existing parents - if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { + if(hier.numParents(c_i) == 0 || !isParent(database, c_j, hier.iterParents(c_i), db_dim)) { clustering.addChildCluster(c_j, c_i); - if(LOG.isDebugging()) { - msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); + if(msg != null) { + msg.append("\n [").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)); msg.append("] is parent of ["); - msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); + msg.append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)); msg.append(']'); } } @@ -581,7 +555,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } } - if(LOG.isDebugging()) { + if(msg != null) { LOG.debug(msg.toString()); } } @@ -590,31 +564,75 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Returns true, if the specified parent cluster is a parent of one child of * the children clusters. * - * @param database the database containing the objects - * @param distFunc the distance function for distance computation between the - * clusters + * @param relation the database containing the objects * @param parent the parent to be tested * @param iter the list of children to be tested + * @param db_dim Database dimensionality * @return true, if the specified parent cluster is a parent of one child of * the children clusters, false otherwise */ - private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, Iter<Cluster<SubspaceModel<V>>> iter) { - V parent_centroid = ProjectedCentroid.make(parent.getModel().getDimensions(), database, parent.getIDs()).toVector(database); - int dimensionality = RelationUtil.dimensionality(database); - int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality(); + private boolean isParent(Relation<V> relation, Cluster<SubspaceModel> parent, Iter<Cluster<SubspaceModel>> iter, int db_dim) { + Subspace s_p = parent.getModel().getSubspace(); + Vector parent_centroid = ProjectedCentroid.make(s_p.getDimensions(), relation, parent.getIDs()); + int subspaceDim_parent = db_dim - s_p.dimensionality(); for(; iter.valid(); iter.advance()) { - Cluster<SubspaceModel<V>> child = iter.get(); - V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database); - PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions()); - if(distance.getCorrelationValue() == subspaceDim_parent) { + Cluster<SubspaceModel> child = iter.get(); + Subspace s_c = child.getModel().getSubspace(); + Vector child_centroid = ProjectedCentroid.make(s_c.getDimensions(), relation, child.getIDs()); + long[] commonPreferenceVector = BitsUtil.andCMin(s_p.getDimensions(), s_c.getDimensions()); + int subspaceDim = subspaceDimensionality(parent_centroid, child_centroid, s_p.getDimensions(), s_c.getDimensions(), commonPreferenceVector); + if(subspaceDim == subspaceDim_parent) { return true; } } - return false; } + /** + * Compute the common subspace dimensionality of two vectors. + * + * @param v1 First vector + * @param v2 Second vector + * @param pv1 First preference + * @param pv2 Second preference + * @param commonPreferenceVector Common preference + * @return Usually, v1.dim - commonPreference.cardinality, unless either pv1 + * and pv2 are a subset of the other. + */ + private int subspaceDimensionality(NumberVector v1, NumberVector v2, long[] pv1, long[] pv2, long[] commonPreferenceVector) { + // number of zero values in commonPreferenceVector + int subspaceDim = v1.getDimensionality() - BitsUtil.cardinality(commonPreferenceVector); + + // special case: v1 and v2 are in parallel subspaces + if(BitsUtil.equal(commonPreferenceVector, pv1) || BitsUtil.equal(commonPreferenceVector, pv2)) { + double d = weightedDistance(v1, v2, commonPreferenceVector); + if(d > 2 * epsilon) { + subspaceDim++; + } + } + return subspaceDim; + } + + /** + * Computes the weighted distance between the two specified vectors according + * to the given preference vector. + * + * @param v1 the first vector + * @param v2 the second vector + * @param weightVector the preference vector + * @return the weighted distance between the two specified vectors according + * to the given preference vector + */ + protected static double weightedDistance(NumberVector v1, NumberVector v2, long[] weightVector) { + double sqrDist = 0; + for(int i = BitsUtil.nextSetBit(weightVector, 0); i >= 0; i = BitsUtil.nextSetBit(weightVector, i + 1)) { + double manhattanI = v1.doubleValue(i) - v2.doubleValue(i); + sqrDist += manhattanI * manhattanI; + } + return Math.sqrt(sqrDist); + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); @@ -626,20 +644,199 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } /** + * OPTICS variant used by DiSH internally. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public class DiSHOPTICS extends GeneralizedOPTICS<V, DiSHClusterOrderEntry> { + /** + * DiSH preprocessor instance. + */ + private DiSHPreferenceVectorIndex<V> index; + + /** + * Constructor. + * + * @param indexinst Preprocessor instance. + */ + public DiSHOPTICS(DiSHPreferenceVectorIndex<V> indexinst) { + super(mu); + this.index = indexinst; + } + + @Override + public Class<? super DiSHClusterOrderEntry> getEntryType() { + return DiSHClusterOrderEntry.class; + } + + @Override + protected DiSHClusterOrderEntry makeSeedEntry(Relation<V> relation, DBID objectID) { + return new DiSHClusterOrderEntry(objectID, null, Integer.MAX_VALUE, Double.POSITIVE_INFINITY, new long[0]); + } + + @Override + protected Collection<DiSHClusterOrderEntry> getNeighborsForDBID(Relation<V> relation, DBID id) { + DBID id1 = DBIDUtil.deref(id); + long[] pv1 = index.getPreferenceVector(id1); + V dv1 = relation.get(id1); + final int dim = dv1.getDimensionality(); + + long[] ones = BitsUtil.ones(dim); + long[] inverseCommonPreferenceVector = BitsUtil.ones(dim); + + ArrayList<DiSHClusterOrderEntry> result = new ArrayList<>(); + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + long[] pv2 = index.getPreferenceVector(iter); + V dv2 = relation.get(iter); + // We need a copy of this for the distance. + long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2); + + // number of zero values in commonPreferenceVector + int subspaceDim = dim - BitsUtil.cardinality(commonPreferenceVector); + + // special case: v1 and v2 are in parallel subspaces + if(BitsUtil.equal(commonPreferenceVector, pv1) || BitsUtil.equal(commonPreferenceVector, pv2)) { + double d = weightedDistance(dv1, dv2, commonPreferenceVector); + if(d > 2 * epsilon) { + subspaceDim++; + } + } + + // flip commonPreferenceVector for distance computation in common + // subspace + System.arraycopy(ones, 0, inverseCommonPreferenceVector, 0, ones.length); + BitsUtil.xorI(inverseCommonPreferenceVector, commonPreferenceVector); + + final double orthogonalDistance = weightedDistance(dv1, dv2, inverseCommonPreferenceVector); + result.add(new DiSHClusterOrderEntry(DBIDUtil.deref(iter), id1, subspaceDim, orthogonalDistance, commonPreferenceVector)); + } + Collections.sort(result); + // This is a hack, but needed to enforce core-distance of OPTICS: + if(result.size() >= getMinPts()) { + DiSHClusterOrderEntry coredist = result.get(getMinPts() - 1); + for(int i = 0; i < getMinPts() - 1; i++) { + final DiSHClusterOrderEntry prev = result.get(i); + result.set(i, new DiSHClusterOrderEntry(prev.getID(), id1, coredist.getCorrelationValue(), coredist.getEuclideanValue(), coredist.commonPreferenceVector)); + } + } + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.DOUBLE_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + } + + /** + * Cluster order entry for DiSH. + * + * @author Elke Achtert + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class DiSHClusterOrderEntry extends CorrelationClusterOrderEntry<DiSHClusterOrderEntry> { + /** + * The common preference vector of the two objects defining this distance. + */ + private long[] commonPreferenceVector; + + /** + * Constructs a new CorrelationDistance object. + * + * @param correlationValue the correlation dimension to be represented by + * the CorrelationDistance + * @param euclideanValue the Euclidean distance to be represented by the + * CorrelationDistance + * @param commonPreferenceVector the common preference vector of the two + * objects defining this distance + */ + public DiSHClusterOrderEntry(DBID objectID, DBID predecessorID, int correlationValue, double euclideanValue, long[] commonPreferenceVector) { + super(objectID, predecessorID, correlationValue, euclideanValue); + this.commonPreferenceVector = commonPreferenceVector; + } + + /** + * Returns the common preference vector of the two objects defining this + * distance. + * + * @return the common preference vector + */ + public long[] getCommonPreferenceVector() { + return commonPreferenceVector; + } + + /** + * Returns a string representation of this + * PreferenceVectorBasedCorrelationDistance. + * + * @return the correlation value, the Euclidean value and the common + * preference vector separated by blanks + */ + @Override + public String toString() { + return super.toString() + SEPARATOR + commonPreferenceVector.toString(); + } + + @Override + public int compareTo(DiSHClusterOrderEntry other) { + return super.compareTo(other); + } + } + + /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * Parameter that specifies the maximum radius of the neighborhood to be + * considered in each dimension for determination of the preference vector, + * must be a double equal to or greater than 0. + * <p> + * Default value: {@code 0.001} + * </p> + * <p> + * Key: {@code -dish.epsilon} + * </p> + */ + public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", // + "The maximum radius of the neighborhood to be considered in each " // + + " dimension for determination of the preference vector."); + + /** + * Parameter that specifies the a minimum number of points as a smoothing + * factor to avoid the single-link-effect, must be an integer greater than + * 0. + * <p> + * Default value: {@code 1} + * </p> + * <p> + * Key: {@code -dish.mu} + * </p> + */ + public static final OptionID MU_ID = new OptionID("dish.mu", // + "The minimum number of points as a smoothing factor to avoid the single-link-effekt."); + protected double epsilon = 0.0; protected int mu = 1; - protected DiSHDistanceFunction dishDistance; - - protected Collection<Pair<OptionID, Object>> opticsO; + /** + * DiSH preprocessor. + */ + protected DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor; @Override protected void makeOptions(Parameterization config) { @@ -657,53 +854,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin mu = muP.intValue(); } - configDiSHDistance(config, epsilon, mu); - - configOPTICS(config, mu, dishDistance); + configDiSHPreprocessor(config, epsilon, mu); } - public void configDiSHDistance(Parameterization config, double epsilon, int minpts) { + public void configDiSHPreprocessor(Parameterization config, double epsilon, int minpts) { ListParameterization dishParameters = new ListParameterization(); - dishParameters.addParameter(DiSHDistanceFunction.EPSILON_ID, epsilon); - dishParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, DiSHPreferenceVectorIndex.Factory.class); - dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, Double.toString(epsilon)); + dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, epsilon); dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.MINPTS_ID, minpts); ChainedParameterization dishchain = new ChainedParameterization(dishParameters, config); dishchain.errorsTo(config); - dishDistance = dishchain.tryInstantiate(DiSHDistanceFunction.class); - } - - /** - * Get the parameters for embedded OPTICS. - * - * @param config Parameterization - * @param minpts MinPts value - * @param dishDistance DiSH distance function - */ - public void configOPTICS(Parameterization config, final int minpts, final DiSHDistanceFunction dishDistance) { - // Configure OPTICS. Tracked parameters - ListParameterization opticsParameters = new ListParameterization(); - opticsParameters.addParameter(OPTICS.EPSILON_ID, AbstractDistance.INFINITY_PATTERN); - opticsParameters.addParameter(OPTICS.MINPTS_ID, minpts); - // Configure OPTICS. Untracked parameters - ListParameterization opticsUntrackedParameters = new ListParameterization(); - opticsUntrackedParameters.addParameter(OPTICS.DISTANCE_FUNCTION_ID, dishDistance); - ChainedParameterization optchain = new ChainedParameterization(opticsParameters, config); - TrackParameters trackpar = new TrackParameters(optchain); - - ChainedParameterization optchain2 = new ChainedParameterization(opticsUntrackedParameters, trackpar); - optchain2.errorsTo(config); - - // Instantiate OPTICS for parameterization - optchain2.tryInstantiate(OPTICS.class); - // store parameters - opticsO = trackpar.getGivenParameters(); + final Class<DiSHPreferenceVectorIndex.Factory<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DiSHPreferenceVectorIndex.Factory.class); + dishPreprocessor = dishchain.tryInstantiate(cls); } @Override protected DiSH<V> makeInstance() { - return new DiSH<>(epsilon, dishDistance, opticsO); + return new DiSH<>(epsilon, mu, dishPreprocessor); } } } |