diff options
author | Erich Schubert <erich@debian.org> | 2014-10-31 03:43:51 +0100 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:40 +0000 |
commit | 596d8876dca5627dd76e8c23bf40a24cc305eeed (patch) | |
tree | d269ddb46561469f6b1fff67b19e0cd2b4608f5b /src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java | |
parent | ee31d687b1a0e2f2f1e6e71375c7cc3b094919b8 (diff) | |
parent | 337087b668d3a54f3afee3a9adb597a32e9f7e94 (diff) |
Import Debian changes 0.6.5~20141030-1
elki (0.6.5~20141030-1) unstable; urgency=medium
* New upstream beta release
* Urgency medium: 0.6.0 suffers from a performance issue with duplicates.
* Repackaged tarball from .jar to .tar.bz2
* Add dependency on libsvm3-java
* Enable line numbers for debugging (ant debuglevel)
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java | 393 |
1 files changed, 254 insertions, 139 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java index d535e136..87f7d519 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,50 +29,51 @@ import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.ERiCNeighborPredicate; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.CorrelationModel; -import de.lmu.ifi.dbs.elki.data.model.DimensionModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.ERiCDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.BitDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner; -import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.StandardCovarianceMatrixBuilder; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.ParameterException; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * Performs correlation clustering on the data partitioned according to local * correlation dimensionality and builds a hierarchy of correlation clusters * that allows multiple inheritance from the clustering result. + * + * Reference: * <p> - * Reference: E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek: On - * Exploring Complex Relationships of Correlation Clusters. <br> + * E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek:<br /> + * On Exploring Complex Relationships of Correlation Clusters.<br /> * In Proc. 19th International Conference on Scientific and Statistical Database * Management (SSDBM 2007), Banff, Canada, 2007. * </p> @@ -81,36 +82,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz * * @apiviz.composedOf COPAC * @apiviz.composedOf DBSCAN - * @apiviz.composedOf ERiCDistanceFunction * @apiviz.composedOf FirstNEigenPairFilter * @apiviz.composedOf PCAFilteredRunner + * @apiviz.composedOf ERiCNeighborPredicate * @apiviz.has CorrelationModel * * @param <V> the type of NumberVector handled by this Algorithm */ -// TODO: Re-use PCARunner objects somehow? @Title("ERiC: Exploring Relationships among Correlation Clusters") -@Description("Performs the DBSCAN algorithm on the data using a special distance function taking into account correlations among attributes and builds " + "a hierarchy that allows multiple inheritance from the correlation clustering result.") -@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek", title = "On Exploring Complex Relationships of Correlation Clusters", booktitle = "Proc. 19th International Conference on Scientific and Statistical Database Management (SSDBM 2007), Banff, Canada, 2007", url = "http://dx.doi.org/10.1109/SSDBM.2007.21") -public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<CorrelationModel<V>>> implements ClusteringAlgorithm<Clustering<CorrelationModel<V>>> { +@Description("Performs the DBSCAN algorithm on the data using a special distance function taking into account correlations among attributes and builds " // + + "a hierarchy that allows multiple inheritance from the correlation clustering result.") +@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek",// +title = "On Exploring Complex Relationships of Correlation Clusters", // +booktitle = "Proc. 19th International Conference on Scientific and Statistical Database Management (SSDBM 2007), Banff, Canada, 2007", // +url = "http://dx.doi.org/10.1109/SSDBM.2007.21") +public class ERiC<V extends NumberVector> extends AbstractAlgorithm<Clustering<CorrelationModel<V>>> implements ClusteringAlgorithm<Clustering<CorrelationModel<V>>> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(ERiC.class); /** - * The COPAC clustering algorithm. + * ERiC Settings. */ - private COPAC<V, IntegerDistance> copacAlgorithm; + private ERiC.Settings settings; /** * Constructor. * - * @param copacAlgorithm COPAC to use + * @param settings ERiC clustering settings */ - public ERiC(COPAC<V, IntegerDistance> copacAlgorithm) { + public ERiC(ERiC.Settings settings) { super(); - this.copacAlgorithm = copacAlgorithm; + this.settings = settings; } /** @@ -119,30 +123,27 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @param relation Relation to process * @return Clustering result */ - public Clustering<CorrelationModel<V>> run(Relation<V> relation) { + public Clustering<CorrelationModel<V>> run(Database database, Relation<V> relation) { final int dimensionality = RelationUtil.dimensionality(relation); StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null; - // run COPAC - if (stepprog != null) { - stepprog.beginStep(1, "Preprocessing local correlation dimensionalities and partitioning data", LOG); - } - Clustering<Model> copacResult = copacAlgorithm.run(relation); - - DistanceQuery<V, IntegerDistance> query = copacAlgorithm.getPartitionDistanceQuery(); + // Run Generalized DBSCAN + LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data"); + // FIXME: how to ensure we are running on the same relation? + ERiCNeighborPredicate<V>.Instance npred = new ERiCNeighborPredicate<V>(settings).instantiate(database, relation); + CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database, TypeUtil.DBIDS); + Clustering<Model> copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run(); // extract correlation clusters - if (stepprog != null) { - stepprog.beginStep(2, "Extract correlation clusters", LOG); - } - List<List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality); - if (LOG.isDebugging()) { + LOG.beginStep(stepprog, 2, "Extract correlation clusters"); + List<List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred); + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters..."); - for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) { + for(int corrDim = 0; corrDim < clusterMap.size(); corrDim++) { List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim); msg.append("\n\ncorrDim ").append(corrDim); - for (Cluster<CorrelationModel<V>> cluster : correlationClusters) { + for(Cluster<CorrelationModel<V>> cluster : correlationClusters) { msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size()); // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex()); // msg.append("\n basis " + @@ -152,42 +153,38 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } LOG.debugFine(msg.toString()); } - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { int clusters = 0; - for (List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap) { + for(List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap) { clusters += correlationClusters.size(); } LOG.verbose(clusters + " clusters extracted."); } // build hierarchy - if (stepprog != null) { - stepprog.beginStep(3, "Building hierarchy", LOG); - } + LOG.beginStep(stepprog, 3, "Building hierarchy"); Clustering<CorrelationModel<V>> clustering = new Clustering<>("ERiC clustering", "eric-clustering"); - buildHierarchy(clustering, clusterMap, query); - if (LOG.isDebugging()) { + buildHierarchy(clustering, clusterMap, npred); + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder("Step 3: Build hierarchy"); - for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) { + for(int corrDim = 0; corrDim < clusterMap.size(); corrDim++) { List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim); - for (Cluster<CorrelationModel<V>> cluster : correlationClusters) { + for(Cluster<CorrelationModel<V>> cluster : correlationClusters) { msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size()); // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex()); - for (Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) { + for(Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) { msg.append("\n parent ").append(iter.get()); } - for (Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) { + for(Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) { msg.append("\n child ").append(iter.get()); } } } LOG.debugFine(msg.toString()); } - if (stepprog != null) { - stepprog.setCompleted(LOG); - } + LOG.setCompleted(stepprog); - for (Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.size() - 1)) { + for(Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.size() - 1)) { clustering.addToplevelCluster(rc); } return clustering; @@ -199,16 +196,17 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * correlation dimension. Each cluster is defined by the basis vectors * defining the subspace in which the cluster appears. * - * @param copacResult + * @param dbscanResult * * @param database the database containing the objects * @param dimensionality the dimensionality of the feature space + * @param npred ERiC predicate * @return a list of clusters for each dimensionality */ - private List<List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> copacResult, Relation<V> database, int dimensionality) { + private List<List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> dbscanResult, Relation<V> database, int dimensionality, ERiCNeighborPredicate<V>.Instance npred) { // result List<List<Cluster<CorrelationModel<V>>>> clusterMap = new ArrayList<>(); - for (int i = 0; i <= dimensionality; i++) { + for(int i = 0; i <= dimensionality; i++) { clusterMap.add(new ArrayList<Cluster<CorrelationModel<V>>>()); } @@ -216,47 +214,38 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin Cluster<Model> noise = null; // iterate over correlation dimensions - for (Cluster<Model> clus : copacResult.getAllClusters()) { + for(Cluster<Model> clus : dbscanResult.getAllClusters()) { DBIDs group = clus.getIDs(); - if (clus.getModel() != null && clus.getModel() instanceof DimensionModel) { - int correlationDimension = ((DimensionModel) clus.getModel()).getDimension(); + int dim = clus.isNoise() ? dimensionality : npred.dimensionality(clus.getIDs().iter()); - ListParameterization parameters = pcaParameters(correlationDimension); - Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class); - PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls); - parameters.failOnErrors(); + if(dim < dimensionality) { + PCAFilteredRunner pca = new PCAFilteredRunner(new StandardCovarianceMatrixBuilder(), new FirstNEigenPairFilter(dim), 1., 0.); // get cluster list for this dimension. - List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(correlationDimension); + List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(dim); PCAFilteredResult pcares = pca.processIds(group, database); V centroid = Centroid.make(database, group).toVector(database); - Cluster<CorrelationModel<V>> correlationCluster = new Cluster<>("[" + correlationDimension + "_" + correlationClusters.size() + "]", group, new CorrelationModel<>(pcares, centroid)); + Cluster<CorrelationModel<V>> correlationCluster = new Cluster<>("[" + dim + "_" + correlationClusters.size() + "]", group, new CorrelationModel<>(pcares, centroid)); correlationClusters.add(correlationCluster); } // partition containing noise - else if (clus.getModel() != null && clus.isNoise()) { - if (noise == null) { + else { + if(noise == null) { noise = clus; - } else { + } + else { ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs()); merged.addDBIDs(clus.getIDs()); noise.setIDs(merged); } - } else { - throw new IllegalStateException("Unexpected group returned: " + clus.getClass().getName()); } } - if (noise != null && noise.size() > 0) { + if(noise != null && noise.size() > 0) { // get cluster list for this dimension. List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(dimensionality); - ListParameterization parameters = pcaParameters(dimensionality); - Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class); - PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls); - for (ParameterException e : parameters.getErrors()) { - LOG.warning("Error in internal parameterization: " + e.getMessage()); - } + PCAFilteredRunner pca = new PCAFilteredRunner(new StandardCovarianceMatrixBuilder(), new FirstNEigenPairFilter(dimensionality), 1., 0.); PCAFilteredResult pcares = pca.processIds(noise.getIDs(), database); V centroid = Centroid.make(database, noise.getIDs()).toVector(database); @@ -265,8 +254,8 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } // Delete dimensionalities not found. - for (int i = dimensionality; i > 0; i--) { - if (clusterMap.get(i).size() > 0) { + for(int i = dimensionality; i > 0; i--) { + if(clusterMap.get(i).size() > 0) { break; } clusterMap.remove(i); @@ -275,63 +264,35 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin return clusterMap; } - /** - * Returns the parameters for the PCA for the specified correlation dimension. - * - * @param correlationDimension the correlation dimension - * @return the parameters for the PCA for the specified correlation dimension - */ - private ListParameterization pcaParameters(int correlationDimension) { - ListParameterization parameters = new ListParameterization(); - - parameters.addParameter(PCAFilteredRunner.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class); - parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, correlationDimension); - - return parameters; - } - - private void buildHierarchy(Clustering<CorrelationModel<V>> clustering, List<List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) { + private void buildHierarchy(Clustering<CorrelationModel<V>> clustering, List<List<Cluster<CorrelationModel<V>>>> clusterMap, ERiCNeighborPredicate<V>.Instance npred) { StringBuilder msg = LOG.isDebuggingFine() ? new StringBuilder() : null; Hierarchy<Cluster<CorrelationModel<V>>> hier = clustering.getClusterHierarchy(); - DBSCAN<V, DoubleDistance> dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query)); - if (dbscan == null) { - // TODO: appropriate exception class? - throw new IllegalArgumentException("ERiC was run without DBSCAN as COPAC algorithm!"); - } - DistanceFunction<? super V, ?> dfun = ProxyDistanceFunction.unwrapDistance(dbscan.getDistanceFunction()); - ERiCDistanceFunction distanceFunction = ClassGenericsUtil.castWithGenericsOrNull(ERiCDistanceFunction.class, dfun); - if (distanceFunction == null) { - // TODO: appropriate exception class? - throw new IllegalArgumentException("ERiC was run without ERiCDistanceFunction as distance function: got " + dfun.getClass()); - } // Find maximum dimensionality found: int lambda_max = clusterMap.size() - 1; - for (int childCorrDim = 0; childCorrDim < lambda_max; childCorrDim++) { + for(int childCorrDim = 0; childCorrDim < lambda_max; childCorrDim++) { List<Cluster<CorrelationModel<V>>> children = clusterMap.get(childCorrDim); - // SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> parentMap = - // clusterMap.tailMap(childCorrDim + 1); - if (msg != null) { + if(msg != null) { msg.append("\ncorrdim ").append(childCorrDim); - // msg.append("\nparents ").append(parentMap.keySet()); } - for (Cluster<CorrelationModel<V>> child : children) { - for (int parentCorrDim = childCorrDim + 1; parentCorrDim <= lambda_max; parentCorrDim++) { + for(Cluster<CorrelationModel<V>> child : children) { + for(int parentCorrDim = childCorrDim + 1; parentCorrDim <= lambda_max; parentCorrDim++) { List<Cluster<CorrelationModel<V>>> parents = clusterMap.get(parentCorrDim); - for (Cluster<CorrelationModel<V>> parent : parents) { + for(Cluster<CorrelationModel<V>> parent : parents) { int subspaceDim_parent = parent.getModel().getPCAResult().getCorrelationDimension(); - if (subspaceDim_parent == lambda_max && hier.numParents(child) == 0) { + if(subspaceDim_parent == lambda_max && hier.numParents(child) == 0) { clustering.addChildCluster(parent, child); - if (msg != null) { + if(msg != null) { msg.append('\n').append(parent).append(" is parent of ").append(child); } - } else { - BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult()); - if (!dist.bitValue() && (hier.numParents(child) == 0 || !isParent(distanceFunction, parent, hier.iterParents(child)))) { + } + else { + boolean dist = npred.weakNeighbors(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult()); + if(!dist && (hier.numParents(child) == 0 || !isParent(npred, parent, hier.iterParents(child)))) { clustering.addChildCluster(parent, child); - if (msg != null) { + if(msg != null) { msg.append('\n').append(parent).append(" is parent of ").append(child); } } @@ -340,45 +301,43 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } } - if (msg != null) { + if(msg != null) { LOG.debugFine(msg.toString()); } - } /** * Returns true, if the specified parent cluster is a parent of one child of * the children clusters. * - * @param distanceFunction the distance function for distance computation - * between the clusters + * @param npred Neighborhood predicate * @param parent the parent to be tested * @param iter the list of children to be tested * @return true, if the specified parent cluster is a parent of one child of * the children clusters, false otherwise */ - private boolean isParent(ERiCDistanceFunction distanceFunction, Cluster<CorrelationModel<V>> parent, Iter<Cluster<CorrelationModel<V>>> iter) { + private boolean isParent(ERiCNeighborPredicate<V>.Instance npred, Cluster<CorrelationModel<V>> parent, Iter<Cluster<CorrelationModel<V>>> iter) { StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null; - for (; iter.valid(); iter.advance()) { + for(; iter.valid(); iter.advance()) { Cluster<CorrelationModel<V>> child = iter.get(); - if (parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) { + if(parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) { return false; } - BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult()); - if (msg != null) { + boolean dist = npred.weakNeighbors(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult()); + if(msg != null) { msg.append("\ndist(").append(child).append(" - ").append(parent).append(") = ").append(dist); } - if (!dist.bitValue()) { - if (msg != null) { + if(dist) { + if(msg != null) { LOG.debugFine(msg); } return true; } } - if (msg != null) { + if(msg != null) { LOG.debugFine(msg.toString()); } return false; @@ -395,28 +354,184 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } /** + * Class to wrap the ERiC settings. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Settings { + /** + * Neighborhood size. + */ + public int k; + + /** + * Class to compute PCA. + */ + public PCAFilteredRunner pca; + + /** + * Parameter to specify the threshold for approximate linear dependency: the + * strong eigenvectors of q are approximately linear dependent from the + * strong eigenvectors p if the following condition holds for all strong + * eigenvectors q_i of q (lambda_q < lambda_p): q_i' * M^check_p * q_i <= + * delta^2, must be a double equal to or greater than 0. + */ + public double delta; + + /** + * Parameter to specify the threshold for the maximum distance between two + * approximately linear dependent subspaces of two objects p and q (lambda_q + * < lambda_p) before considering them as parallel, must be a double equal + * to or greater than 0. + */ + public double tau; + + /** + * Minimum neighborhood size (density). + */ + public int minpts; + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Size for the kNN neighborhood used in the PCA step of ERiC. + */ + public static final OptionID K_ID = new OptionID("eric.k", "Number of neighbors to use for PCA."); + + /** + * Parameter to specify the threshold for approximate linear dependency: + * the strong eigenvectors of q are approximately linear dependent from + * the strong eigenvectors p if the following condition holds for all + * strong eigenvectors q_i of q (lambda_q < lambda_p): q_i' * M^check_p * + * q_i <= delta^2, must be a double equal to or greater than 0. + * <p> + * Default value: {@code 0.1} + * </p> + * <p> + * Key: {@code -ericdf.delta} + * </p> + */ + public static final OptionID DELTA_ID = new OptionID("ericdf.delta", "Threshold for approximate linear dependency: " + "the strong eigenvectors of q are approximately linear dependent " + "from the strong eigenvectors p if the following condition " + "holds for all stroneg eigenvectors q_i of q (lambda_q < lambda_p): " + "q_i' * M^check_p * q_i <= delta^2."); + + /** + * Parameter to specify the threshold for the maximum distance between two + * approximately linear dependent subspaces of two objects p and q + * (lambda_q < lambda_p) before considering them as parallel, must be a + * double equal to or greater than 0. + * <p> + * Default value: {@code 0.1} + * </p> + * <p> + * Key: {@code -ericdf.tau} + * </p> + */ + public static final OptionID TAU_ID = new OptionID("ericdf.tau", "Threshold for the maximum distance between two approximately linear " + "dependent subspaces of two objects p and q " + "(lambda_q < lambda_p) before considering them as parallel."); + + /** + * Settings to build. + */ + Settings settings; + + @Override + public void makeOptions(Parameterization config) { + settings = new Settings(); + configK(config); + // TODO: allow using other PCA runners? + settings.pca = config.tryInstantiate(PCAFilteredRunner.class); + configDelta(config); + configTau(config); + configMinPts(config); + } + + /** + * Configure the kNN parameter. + * + * @param config Parameter source + */ + protected void configK(Parameterization config) { + IntParameter kP = new IntParameter(K_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { + settings.k = kP.intValue(); + } + } + + /** + * Configure the delta parameter. + * + * @param config Parameter source + */ + protected void configDelta(Parameterization config) { + final DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.1); + deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(deltaP)) { + settings.delta = deltaP.doubleValue(); + } + } + + /** + * Configure the tau parameter. + * + * @param config Parameter source + */ + protected void configTau(Parameterization config) { + final DoubleParameter tauP = new DoubleParameter(TAU_ID, 0.1); + tauP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(tauP)) { + settings.tau = tauP.doubleValue(); + } + } + + /** + * Configure the minPts aka "mu" parameter. + * + * @param config Parameter source + */ + protected void configMinPts(Parameterization config) { + IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minptsP)) { + settings.minpts = minptsP.intValue(); + } + } + + @Override + public Settings makeInstance() { + return settings; + } + } + } + + /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { /** - * The COPAC instance to use + * The settings to use. */ - protected COPAC<V, IntegerDistance> copac; + protected ERiC.Settings settings; - @SuppressWarnings("unchecked") @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - copac = config.tryInstantiate(COPAC.class); + settings = config.tryInstantiate(ERiC.Settings.class); } @Override protected ERiC<V> makeInstance() { - return new ERiC<>(copac); + return new ERiC<>(settings); } } } |