summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
diff options
context:
space:
mode:
authorErich Schubert <erich@debian.org>2014-10-31 03:43:51 +0100
committerAndrej Shadura <andrewsh@debian.org>2019-03-09 22:30:40 +0000
commit596d8876dca5627dd76e8c23bf40a24cc305eeed (patch)
treed269ddb46561469f6b1fff67b19e0cd2b4608f5b /src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
parentee31d687b1a0e2f2f1e6e71375c7cc3b094919b8 (diff)
parent337087b668d3a54f3afee3a9adb597a32e9f7e94 (diff)
Import Debian changes 0.6.5~20141030-1
elki (0.6.5~20141030-1) unstable; urgency=medium * New upstream beta release * Urgency medium: 0.6.0 suffers from a performance issue with duplicates. * Repackaged tarball from .jar to .tar.bz2 * Add dependency on libsvm3-java * Enable line numbers for debugging (ant debuglevel)
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java393
1 files changed, 254 insertions, 139 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
index d535e136..87f7d519 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,50 +29,51 @@ import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.ERiCNeighborPredicate;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.CorrelationModel;
-import de.lmu.ifi.dbs.elki.data.model.DimensionModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.ERiCDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.BitDistance;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner;
-import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.StandardCovarianceMatrixBuilder;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.ParameterException;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
* Performs correlation clustering on the data partitioned according to local
* correlation dimensionality and builds a hierarchy of correlation clusters
* that allows multiple inheritance from the clustering result.
+ *
+ * Reference:
* <p>
- * Reference: E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek: On
- * Exploring Complex Relationships of Correlation Clusters. <br>
+ * E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek:<br />
+ * On Exploring Complex Relationships of Correlation Clusters.<br />
* In Proc. 19th International Conference on Scientific and Statistical Database
* Management (SSDBM 2007), Banff, Canada, 2007.
* </p>
@@ -81,36 +82,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz
*
* @apiviz.composedOf COPAC
* @apiviz.composedOf DBSCAN
- * @apiviz.composedOf ERiCDistanceFunction
* @apiviz.composedOf FirstNEigenPairFilter
* @apiviz.composedOf PCAFilteredRunner
+ * @apiviz.composedOf ERiCNeighborPredicate
* @apiviz.has CorrelationModel
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
-// TODO: Re-use PCARunner objects somehow?
@Title("ERiC: Exploring Relationships among Correlation Clusters")
-@Description("Performs the DBSCAN algorithm on the data using a special distance function taking into account correlations among attributes and builds " + "a hierarchy that allows multiple inheritance from the correlation clustering result.")
-@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek", title = "On Exploring Complex Relationships of Correlation Clusters", booktitle = "Proc. 19th International Conference on Scientific and Statistical Database Management (SSDBM 2007), Banff, Canada, 2007", url = "http://dx.doi.org/10.1109/SSDBM.2007.21")
-public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<CorrelationModel<V>>> implements ClusteringAlgorithm<Clustering<CorrelationModel<V>>> {
+@Description("Performs the DBSCAN algorithm on the data using a special distance function taking into account correlations among attributes and builds " //
+ + "a hierarchy that allows multiple inheritance from the correlation clustering result.")
+@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek",//
+title = "On Exploring Complex Relationships of Correlation Clusters", //
+booktitle = "Proc. 19th International Conference on Scientific and Statistical Database Management (SSDBM 2007), Banff, Canada, 2007", //
+url = "http://dx.doi.org/10.1109/SSDBM.2007.21")
+public class ERiC<V extends NumberVector> extends AbstractAlgorithm<Clustering<CorrelationModel<V>>> implements ClusteringAlgorithm<Clustering<CorrelationModel<V>>> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(ERiC.class);
/**
- * The COPAC clustering algorithm.
+ * ERiC Settings.
*/
- private COPAC<V, IntegerDistance> copacAlgorithm;
+ private ERiC.Settings settings;
/**
* Constructor.
*
- * @param copacAlgorithm COPAC to use
+ * @param settings ERiC clustering settings
*/
- public ERiC(COPAC<V, IntegerDistance> copacAlgorithm) {
+ public ERiC(ERiC.Settings settings) {
super();
- this.copacAlgorithm = copacAlgorithm;
+ this.settings = settings;
}
/**
@@ -119,30 +123,27 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
* @param relation Relation to process
* @return Clustering result
*/
- public Clustering<CorrelationModel<V>> run(Relation<V> relation) {
+ public Clustering<CorrelationModel<V>> run(Database database, Relation<V> relation) {
final int dimensionality = RelationUtil.dimensionality(relation);
StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
- // run COPAC
- if (stepprog != null) {
- stepprog.beginStep(1, "Preprocessing local correlation dimensionalities and partitioning data", LOG);
- }
- Clustering<Model> copacResult = copacAlgorithm.run(relation);
-
- DistanceQuery<V, IntegerDistance> query = copacAlgorithm.getPartitionDistanceQuery();
+ // Run Generalized DBSCAN
+ LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data");
+ // FIXME: how to ensure we are running on the same relation?
+ ERiCNeighborPredicate<V>.Instance npred = new ERiCNeighborPredicate<V>(settings).instantiate(database, relation);
+ CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database, TypeUtil.DBIDS);
+ Clustering<Model> copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
// extract correlation clusters
- if (stepprog != null) {
- stepprog.beginStep(2, "Extract correlation clusters", LOG);
- }
- List<List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality);
- if (LOG.isDebugging()) {
+ LOG.beginStep(stepprog, 2, "Extract correlation clusters");
+ List<List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred);
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters...");
- for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
+ for(int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim);
msg.append("\n\ncorrDim ").append(corrDim);
- for (Cluster<CorrelationModel<V>> cluster : correlationClusters) {
+ for(Cluster<CorrelationModel<V>> cluster : correlationClusters) {
msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
// .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
// msg.append("\n basis " +
@@ -152,42 +153,38 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
LOG.debugFine(msg.toString());
}
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
int clusters = 0;
- for (List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap) {
+ for(List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap) {
clusters += correlationClusters.size();
}
LOG.verbose(clusters + " clusters extracted.");
}
// build hierarchy
- if (stepprog != null) {
- stepprog.beginStep(3, "Building hierarchy", LOG);
- }
+ LOG.beginStep(stepprog, 3, "Building hierarchy");
Clustering<CorrelationModel<V>> clustering = new Clustering<>("ERiC clustering", "eric-clustering");
- buildHierarchy(clustering, clusterMap, query);
- if (LOG.isDebugging()) {
+ buildHierarchy(clustering, clusterMap, npred);
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder("Step 3: Build hierarchy");
- for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
+ for(int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim);
- for (Cluster<CorrelationModel<V>> cluster : correlationClusters) {
+ for(Cluster<CorrelationModel<V>> cluster : correlationClusters) {
msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
// .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
- for (Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
msg.append("\n parent ").append(iter.get());
}
- for (Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
msg.append("\n child ").append(iter.get());
}
}
}
LOG.debugFine(msg.toString());
}
- if (stepprog != null) {
- stepprog.setCompleted(LOG);
- }
+ LOG.setCompleted(stepprog);
- for (Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.size() - 1)) {
+ for(Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.size() - 1)) {
clustering.addToplevelCluster(rc);
}
return clustering;
@@ -199,16 +196,17 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
* correlation dimension. Each cluster is defined by the basis vectors
* defining the subspace in which the cluster appears.
*
- * @param copacResult
+ * @param dbscanResult
*
* @param database the database containing the objects
* @param dimensionality the dimensionality of the feature space
+ * @param npred ERiC predicate
* @return a list of clusters for each dimensionality
*/
- private List<List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> copacResult, Relation<V> database, int dimensionality) {
+ private List<List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> dbscanResult, Relation<V> database, int dimensionality, ERiCNeighborPredicate<V>.Instance npred) {
// result
List<List<Cluster<CorrelationModel<V>>>> clusterMap = new ArrayList<>();
- for (int i = 0; i <= dimensionality; i++) {
+ for(int i = 0; i <= dimensionality; i++) {
clusterMap.add(new ArrayList<Cluster<CorrelationModel<V>>>());
}
@@ -216,47 +214,38 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
Cluster<Model> noise = null;
// iterate over correlation dimensions
- for (Cluster<Model> clus : copacResult.getAllClusters()) {
+ for(Cluster<Model> clus : dbscanResult.getAllClusters()) {
DBIDs group = clus.getIDs();
- if (clus.getModel() != null && clus.getModel() instanceof DimensionModel) {
- int correlationDimension = ((DimensionModel) clus.getModel()).getDimension();
+ int dim = clus.isNoise() ? dimensionality : npred.dimensionality(clus.getIDs().iter());
- ListParameterization parameters = pcaParameters(correlationDimension);
- Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class);
- PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls);
- parameters.failOnErrors();
+ if(dim < dimensionality) {
+ PCAFilteredRunner pca = new PCAFilteredRunner(new StandardCovarianceMatrixBuilder(), new FirstNEigenPairFilter(dim), 1., 0.);
// get cluster list for this dimension.
- List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(correlationDimension);
+ List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(dim);
PCAFilteredResult pcares = pca.processIds(group, database);
V centroid = Centroid.make(database, group).toVector(database);
- Cluster<CorrelationModel<V>> correlationCluster = new Cluster<>("[" + correlationDimension + "_" + correlationClusters.size() + "]", group, new CorrelationModel<>(pcares, centroid));
+ Cluster<CorrelationModel<V>> correlationCluster = new Cluster<>("[" + dim + "_" + correlationClusters.size() + "]", group, new CorrelationModel<>(pcares, centroid));
correlationClusters.add(correlationCluster);
}
// partition containing noise
- else if (clus.getModel() != null && clus.isNoise()) {
- if (noise == null) {
+ else {
+ if(noise == null) {
noise = clus;
- } else {
+ }
+ else {
ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
merged.addDBIDs(clus.getIDs());
noise.setIDs(merged);
}
- } else {
- throw new IllegalStateException("Unexpected group returned: " + clus.getClass().getName());
}
}
- if (noise != null && noise.size() > 0) {
+ if(noise != null && noise.size() > 0) {
// get cluster list for this dimension.
List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(dimensionality);
- ListParameterization parameters = pcaParameters(dimensionality);
- Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class);
- PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls);
- for (ParameterException e : parameters.getErrors()) {
- LOG.warning("Error in internal parameterization: " + e.getMessage());
- }
+ PCAFilteredRunner pca = new PCAFilteredRunner(new StandardCovarianceMatrixBuilder(), new FirstNEigenPairFilter(dimensionality), 1., 0.);
PCAFilteredResult pcares = pca.processIds(noise.getIDs(), database);
V centroid = Centroid.make(database, noise.getIDs()).toVector(database);
@@ -265,8 +254,8 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
// Delete dimensionalities not found.
- for (int i = dimensionality; i > 0; i--) {
- if (clusterMap.get(i).size() > 0) {
+ for(int i = dimensionality; i > 0; i--) {
+ if(clusterMap.get(i).size() > 0) {
break;
}
clusterMap.remove(i);
@@ -275,63 +264,35 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
return clusterMap;
}
- /**
- * Returns the parameters for the PCA for the specified correlation dimension.
- *
- * @param correlationDimension the correlation dimension
- * @return the parameters for the PCA for the specified correlation dimension
- */
- private ListParameterization pcaParameters(int correlationDimension) {
- ListParameterization parameters = new ListParameterization();
-
- parameters.addParameter(PCAFilteredRunner.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class);
- parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, correlationDimension);
-
- return parameters;
- }
-
- private void buildHierarchy(Clustering<CorrelationModel<V>> clustering, List<List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) {
+ private void buildHierarchy(Clustering<CorrelationModel<V>> clustering, List<List<Cluster<CorrelationModel<V>>>> clusterMap, ERiCNeighborPredicate<V>.Instance npred) {
StringBuilder msg = LOG.isDebuggingFine() ? new StringBuilder() : null;
Hierarchy<Cluster<CorrelationModel<V>>> hier = clustering.getClusterHierarchy();
- DBSCAN<V, DoubleDistance> dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query));
- if (dbscan == null) {
- // TODO: appropriate exception class?
- throw new IllegalArgumentException("ERiC was run without DBSCAN as COPAC algorithm!");
- }
- DistanceFunction<? super V, ?> dfun = ProxyDistanceFunction.unwrapDistance(dbscan.getDistanceFunction());
- ERiCDistanceFunction distanceFunction = ClassGenericsUtil.castWithGenericsOrNull(ERiCDistanceFunction.class, dfun);
- if (distanceFunction == null) {
- // TODO: appropriate exception class?
- throw new IllegalArgumentException("ERiC was run without ERiCDistanceFunction as distance function: got " + dfun.getClass());
- }
// Find maximum dimensionality found:
int lambda_max = clusterMap.size() - 1;
- for (int childCorrDim = 0; childCorrDim < lambda_max; childCorrDim++) {
+ for(int childCorrDim = 0; childCorrDim < lambda_max; childCorrDim++) {
List<Cluster<CorrelationModel<V>>> children = clusterMap.get(childCorrDim);
- // SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> parentMap =
- // clusterMap.tailMap(childCorrDim + 1);
- if (msg != null) {
+ if(msg != null) {
msg.append("\ncorrdim ").append(childCorrDim);
- // msg.append("\nparents ").append(parentMap.keySet());
}
- for (Cluster<CorrelationModel<V>> child : children) {
- for (int parentCorrDim = childCorrDim + 1; parentCorrDim <= lambda_max; parentCorrDim++) {
+ for(Cluster<CorrelationModel<V>> child : children) {
+ for(int parentCorrDim = childCorrDim + 1; parentCorrDim <= lambda_max; parentCorrDim++) {
List<Cluster<CorrelationModel<V>>> parents = clusterMap.get(parentCorrDim);
- for (Cluster<CorrelationModel<V>> parent : parents) {
+ for(Cluster<CorrelationModel<V>> parent : parents) {
int subspaceDim_parent = parent.getModel().getPCAResult().getCorrelationDimension();
- if (subspaceDim_parent == lambda_max && hier.numParents(child) == 0) {
+ if(subspaceDim_parent == lambda_max && hier.numParents(child) == 0) {
clustering.addChildCluster(parent, child);
- if (msg != null) {
+ if(msg != null) {
msg.append('\n').append(parent).append(" is parent of ").append(child);
}
- } else {
- BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
- if (!dist.bitValue() && (hier.numParents(child) == 0 || !isParent(distanceFunction, parent, hier.iterParents(child)))) {
+ }
+ else {
+ boolean dist = npred.weakNeighbors(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
+ if(!dist && (hier.numParents(child) == 0 || !isParent(npred, parent, hier.iterParents(child)))) {
clustering.addChildCluster(parent, child);
- if (msg != null) {
+ if(msg != null) {
msg.append('\n').append(parent).append(" is parent of ").append(child);
}
}
@@ -340,45 +301,43 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
}
- if (msg != null) {
+ if(msg != null) {
LOG.debugFine(msg.toString());
}
-
}
/**
* Returns true, if the specified parent cluster is a parent of one child of
* the children clusters.
*
- * @param distanceFunction the distance function for distance computation
- * between the clusters
+ * @param npred Neighborhood predicate
* @param parent the parent to be tested
* @param iter the list of children to be tested
* @return true, if the specified parent cluster is a parent of one child of
* the children clusters, false otherwise
*/
- private boolean isParent(ERiCDistanceFunction distanceFunction, Cluster<CorrelationModel<V>> parent, Iter<Cluster<CorrelationModel<V>>> iter) {
+ private boolean isParent(ERiCNeighborPredicate<V>.Instance npred, Cluster<CorrelationModel<V>> parent, Iter<Cluster<CorrelationModel<V>>> iter) {
StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null;
- for (; iter.valid(); iter.advance()) {
+ for(; iter.valid(); iter.advance()) {
Cluster<CorrelationModel<V>> child = iter.get();
- if (parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) {
+ if(parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) {
return false;
}
- BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
- if (msg != null) {
+ boolean dist = npred.weakNeighbors(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
+ if(msg != null) {
msg.append("\ndist(").append(child).append(" - ").append(parent).append(") = ").append(dist);
}
- if (!dist.bitValue()) {
- if (msg != null) {
+ if(dist) {
+ if(msg != null) {
LOG.debugFine(msg);
}
return true;
}
}
- if (msg != null) {
+ if(msg != null) {
LOG.debugFine(msg.toString());
}
return false;
@@ -395,28 +354,184 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
/**
+ * Class to wrap the ERiC settings.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Settings {
+ /**
+ * Neighborhood size.
+ */
+ public int k;
+
+ /**
+ * Class to compute PCA.
+ */
+ public PCAFilteredRunner pca;
+
+ /**
+ * Parameter to specify the threshold for approximate linear dependency: the
+ * strong eigenvectors of q are approximately linear dependent from the
+ * strong eigenvectors p if the following condition holds for all strong
+ * eigenvectors q_i of q (lambda_q < lambda_p): q_i' * M^check_p * q_i <=
+ * delta^2, must be a double equal to or greater than 0.
+ */
+ public double delta;
+
+ /**
+ * Parameter to specify the threshold for the maximum distance between two
+ * approximately linear dependent subspaces of two objects p and q (lambda_q
+ * < lambda_p) before considering them as parallel, must be a double equal
+ * to or greater than 0.
+ */
+ public double tau;
+
+ /**
+ * Minimum neighborhood size (density).
+ */
+ public int minpts;
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Size for the kNN neighborhood used in the PCA step of ERiC.
+ */
+ public static final OptionID K_ID = new OptionID("eric.k", "Number of neighbors to use for PCA.");
+
+ /**
+ * Parameter to specify the threshold for approximate linear dependency:
+ * the strong eigenvectors of q are approximately linear dependent from
+ * the strong eigenvectors p if the following condition holds for all
+ * strong eigenvectors q_i of q (lambda_q < lambda_p): q_i' * M^check_p *
+ * q_i <= delta^2, must be a double equal to or greater than 0.
+ * <p>
+ * Default value: {@code 0.1}
+ * </p>
+ * <p>
+ * Key: {@code -ericdf.delta}
+ * </p>
+ */
+ public static final OptionID DELTA_ID = new OptionID("ericdf.delta", "Threshold for approximate linear dependency: " + "the strong eigenvectors of q are approximately linear dependent " + "from the strong eigenvectors p if the following condition " + "holds for all stroneg eigenvectors q_i of q (lambda_q < lambda_p): " + "q_i' * M^check_p * q_i <= delta^2.");
+
+ /**
+ * Parameter to specify the threshold for the maximum distance between two
+ * approximately linear dependent subspaces of two objects p and q
+ * (lambda_q < lambda_p) before considering them as parallel, must be a
+ * double equal to or greater than 0.
+ * <p>
+ * Default value: {@code 0.1}
+ * </p>
+ * <p>
+ * Key: {@code -ericdf.tau}
+ * </p>
+ */
+ public static final OptionID TAU_ID = new OptionID("ericdf.tau", "Threshold for the maximum distance between two approximately linear " + "dependent subspaces of two objects p and q " + "(lambda_q < lambda_p) before considering them as parallel.");
+
+ /**
+ * Settings to build.
+ */
+ Settings settings;
+
+ @Override
+ public void makeOptions(Parameterization config) {
+ settings = new Settings();
+ configK(config);
+ // TODO: allow using other PCA runners?
+ settings.pca = config.tryInstantiate(PCAFilteredRunner.class);
+ configDelta(config);
+ configTau(config);
+ configMinPts(config);
+ }
+
+ /**
+ * Configure the kNN parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configK(Parameterization config) {
+ IntParameter kP = new IntParameter(K_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ settings.k = kP.intValue();
+ }
+ }
+
+ /**
+ * Configure the delta parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configDelta(Parameterization config) {
+ final DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.1);
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(deltaP)) {
+ settings.delta = deltaP.doubleValue();
+ }
+ }
+
+ /**
+ * Configure the tau parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configTau(Parameterization config) {
+ final DoubleParameter tauP = new DoubleParameter(TAU_ID, 0.1);
+ tauP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(tauP)) {
+ settings.tau = tauP.doubleValue();
+ }
+ }
+
+ /**
+ * Configure the minPts aka "mu" parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configMinPts(Parameterization config) {
+ IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minptsP)) {
+ settings.minpts = minptsP.intValue();
+ }
+ }
+
+ @Override
+ public Settings makeInstance() {
+ return settings;
+ }
+ }
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
- * The COPAC instance to use
+ * The settings to use.
*/
- protected COPAC<V, IntegerDistance> copac;
+ protected ERiC.Settings settings;
- @SuppressWarnings("unchecked")
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- copac = config.tryInstantiate(COPAC.class);
+ settings = config.tryInstantiate(ERiC.Settings.class);
}
@Override
protected ERiC<V> makeInstance() {
- return new ERiC<>(copac);
+ return new ERiC<>(settings);
}
}
}