Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace')
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java                      |  24
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java                         | 151
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java                        | 681
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java                        | 280
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java                         | 135
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java                     | 546
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java                    | 225
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java                      | 154
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java |   4
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java       | 132
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java       |  24
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java           |  46
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java         |   2
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java                |  37
14 files changed, 1560 insertions, 881 deletions
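
A few API migrations recur across the per-file hunks below: NumberVector and SubspaceModel lose their type parameters, java.util.BitSet is replaced by primitive long[] bitmasks via BitsUtil, and the DoubleDistance value type disappears from query signatures. As orientation only (not part of the commit), a minimal sketch of the cluster-construction pattern the hunks converge on, wrapped in a hypothetical helper class:

    import de.lmu.ifi.dbs.elki.data.Cluster;
    import de.lmu.ifi.dbs.elki.data.NumberVector;
    import de.lmu.ifi.dbs.elki.data.Subspace;
    import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
    import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
    import de.lmu.ifi.dbs.elki.database.relation.Relation;
    import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;

    public final class SubspaceClusterFactory {
      /** Hypothetical helper mirroring the post-commit pattern in CLIQUE, DOC and DiSH. */
      public static <V extends NumberVector> Cluster<SubspaceModel> make(Relation<V> relation, DBIDs ids, Subspace subspace) {
        // SubspaceModel is no longer generic and takes the Centroid directly
        // (previously: new SubspaceModel<>(subspace, Centroid.make(...).toVector(relation))).
        Cluster<SubspaceModel> cluster = new Cluster<>(ids);
        cluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, ids)));
        return cluster;
      }
    }

The helper name and wrapper class are illustrative; the two calls inside match the post-commit code in CLIQUE.java and DOC.java below.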
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
index 617d74cd..f55ec964 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -35,9 +35,9 @@ import java.util.TreeMap;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace;
import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEUnit;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEInterval;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
-import de.lmu.ifi.dbs.elki.data.Interval;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.Subspace;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
@@ -95,7 +95,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications")
@Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.")
@Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314")
-public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+public class CLIQUE<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -165,7 +165,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
* @param relation Data relation to process
* @return Clustering result
*/
- public Clustering<SubspaceModel<V>> run(Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Relation<V> relation) {
// 1. Identification of subspaces that contain clusters
// TODO: use step logging.
if(LOG.isVerbose()) {
@@ -203,7 +203,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
}
// build result
int numClusters = 1;
- Clustering<SubspaceModel<V>> result = new Clustering<>("CLIQUE clustering", "clique-clustering");
+ Clustering<SubspaceModel> result = new Clustering<>("CLIQUE clustering", "clique-clustering");
for(Integer dim : dimensionToDenseSubspaces.keySet()) {
List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim);
List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces);
@@ -213,8 +213,8 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
}
for(Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) {
- Cluster<SubspaceModel<V>> newCluster = new Cluster<>(modelAndCluster.second);
- newCluster.setModel(new SubspaceModel<>(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).toVector(relation)));
+ Cluster<SubspaceModel> newCluster = new Cluster<>(modelAndCluster.second);
+ newCluster.setModel(new SubspaceModel(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second)));
newCluster.setName("cluster_" + numClusters++);
result.addToplevelCluster(newCluster);
}
@@ -313,9 +313,9 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
if(LOG.isDebuggingFiner()) {
StringBuilder msg = new StringBuilder();
- msg.append(" minima: ").append(FormatUtil.format(minima, ", ", 2));
- msg.append("\n maxima: ").append(FormatUtil.format(maxima, ", ", 2));
- msg.append("\n unit lengths: ").append(FormatUtil.format(unit_lengths, ", ", 2));
+ msg.append(" minima: ").append(FormatUtil.format(minima, ", ", FormatUtil.NF2));
+ msg.append("\n maxima: ").append(FormatUtil.format(maxima, ", ", FormatUtil.NF2));
+ msg.append("\n unit lengths: ").append(FormatUtil.format(unit_lengths, ", ", FormatUtil.NF2));
LOG.debugFiner(msg.toString());
}
@@ -341,7 +341,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
List<CLIQUEUnit<V>> units = new ArrayList<>((xsi * dimensionality));
for(int x = 0; x < xsi; x++) {
for(int d = 0; d < dimensionality; d++) {
- units.add(new CLIQUEUnit<V>(new Interval(d, unit_bounds[x][d], unit_bounds[x + 1][d])));
+ units.add(new CLIQUEUnit<V>(new CLIQUEInterval(d, unit_bounds[x][d], unit_bounds[x + 1][d])));
}
}
@@ -582,7 +582,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
protected int xsi;
protected double tau;
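
The CLIQUE hunks above also move the grid interval helper from de.lmu.ifi.dbs.elki.data.Interval into the clique subpackage as CLIQUEInterval. A minimal sketch of the new unit construction, assuming the (dimension, min, max) constructor used in the hunk and hypothetical grid bounds:

    import java.util.ArrayList;
    import java.util.List;

    import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEInterval;
    import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEUnit;
    import de.lmu.ifi.dbs.elki.data.NumberVector;

    public final class CliqueGridSketch {
      /** Build one grid unit per (cell, dimension) pair, as in the hunk above. */
      static <V extends NumberVector> List<CLIQUEUnit<V>> buildUnits(double[][] bounds, int xsi, int dim) {
        List<CLIQUEUnit<V>> units = new ArrayList<>(xsi * dim);
        for(int x = 0; x < xsi; x++) {
          for(int d = 0; d < dim; d++) {
            // CLIQUEInterval carries the dimension index plus the [min, max) cell bounds.
            units.add(new CLIQUEUnit<V>(new CLIQUEInterval(d, bounds[x][d], bounds[x + 1][d])));
          }
        }
        return units;
      }
    }

Here bounds plays the role of unit_bounds in the diff: row x holds the cell edges per dimension, so cell x in dimension d spans [bounds[x][d], bounds[x+1][d]).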
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
index 5f798a66..fa8bbbe4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,7 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.BitSet;
import java.util.Random;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
@@ -47,12 +46,12 @@ import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceMaximumDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -66,14 +65,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
/**
* <p>
- * Provides the DOC algorithm, and it's heuristic variant, FastDOC. DOC is a
- * sampling based subspace clustering algorithm.
+ * The DOC algorithm, and its heuristic variant, FastDOC. DOC is a sampling
+ * based subspace clustering algorithm.
* </p>
*
* <p>
* Reference: <br/>
* C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali<br />
- * A Monte Carlo algorithm for fast projective clustering. <br/>
+ * A Monte Carlo algorithm for fast projective clustering. <br />
* In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02).
* </p>
*
@@ -84,8 +83,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
* @param <V> the type of NumberVector handled by this Algorithm.
*/
@Title("DOC: Density-based Optimal projective Clustering")
-@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", title = "A Monte Carlo algorithm for fast projective clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", url = "http://dx.doi.org/10.1145/564691.564739")
-public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", //
+title = "A Monte Carlo algorithm for fast projective clustering", //
+booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", //
+url = "http://dx.doi.org/10.1145/564691.564739")
+public class DOC<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -152,7 +154,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param database Database
* @param relation Data relation
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
// Dimensionality of our set.
final int d = RelationUtil.dimensionality(relation);
@@ -173,7 +175,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
int minClusterSize = (int) (alpha * S.size());
// List of all clusters we found.
- Clustering<SubspaceModel<V>> result = new Clustering<>("DOC Clusters", "DOC");
+ Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
// Inform the user about the number of actual clusters found so far.
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
@@ -181,7 +183,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// To not only find a single cluster, we continue running until our set
// of points is empty.
while(S.size() > minClusterSize) {
- Cluster<SubspaceModel<V>> C;
+ Cluster<SubspaceModel> C;
if(heuristics) {
C = runFastDOC(relation, S, d, n, m, (int) r);
}
@@ -206,15 +208,10 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// Add the remainder as noise.
if(S.size() > 0) {
- BitSet alldims = new BitSet();
- alldims.set(0, d);
- result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel<>(new Subspace(alldims), Centroid.make(relation, S).toVector(relation))));
+ long[] alldims = BitsUtil.ones(d);
+ result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S))));
}
-
- if(cprogress != null) {
- cprogress.setCompleted(LOG);
- }
-
+ LOG.setCompleted(cprogress);
return result;
}
@@ -230,12 +227,11 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param minClusterSize Minimum size a cluster must have to be accepted.
* @return a cluster, if one is found, else <code>null</code>.
*/
- private Cluster<SubspaceModel<V>> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
- final DoubleDistance wd = new DoubleDistance(w);
+ private Cluster<SubspaceModel> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
// Best cluster for the current run.
DBIDs C = null;
// Relevant attributes for the best cluster.
- BitSet D = null;
+ long[] D = null;
// Quality of the best cluster.
double quality = Double.NEGATIVE_INFINITY;
@@ -244,9 +240,9 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// double[d], new double[d]);
// Weights for distance (= rectangle query)
- SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(new BitSet(d));
- DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
- RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq);
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
+ DistanceQuery<V> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V> rq = relation.getDatabase().getRangeQuery(dq);
// Inform the user about the progress in the current iteration.
FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
@@ -263,22 +259,22 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
// Initialize cluster info.
- BitSet nD = new BitSet(d);
+ long[] nD = BitsUtil.zero(d);
// Test each dimension and build bounding box.
for(int k = 0; k < d; ++k) {
if(dimensionIsRelevant(k, relation, randomSet)) {
- nD.set(k);
+ BitsUtil.setI(nD, k);
}
}
- if(nD.cardinality() > 0) {
+ if(BitsUtil.cardinality(nD) > 0) {
// Get all points in the box.
df.setSelectedDimensions(nD);
// TODO: add filtering capabilities into query API!
- DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, wd));
+ DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));
if(LOG.isDebuggingFiner()) {
- LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + nD.cardinality());
+ LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD));
}
// Is the cluster large enough?
@@ -290,7 +286,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
else {
// Better cluster than before?
- double nQuality = computeClusterQuality(nC.size(), nD.cardinality());
+ double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
if(nQuality > quality) {
if(LOG.isDebuggingFiner()) {
LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
@@ -306,23 +302,12 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
}
}
-
- if(iprogress != null) {
- iprogress.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(iprogress);
}
}
+ LOG.ensureCompleted(iprogress);
- if(iprogress != null) {
- iprogress.ensureCompleted(LOG);
- }
-
- if(C != null) {
- return makeCluster(relation, C, D);
- }
- else {
- return null;
- }
+ return (C != null) ? makeCluster(relation, C, D) : null;
}
/**
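
Throughout runDOC above, java.util.BitSet is replaced by primitive long[] bitmasks manipulated through BitsUtil. A self-contained sketch of the idiom, using only the BitsUtil calls that appear in these hunks:

    import de.lmu.ifi.dbs.elki.utilities.BitsUtil;

    public class BitsUtilDemo {
      public static void main(String[] args) {
        final int d = 10; // dimensionality
        long[] dims = BitsUtil.zero(d);    // empty dimension set, replaces new BitSet(d)
        BitsUtil.setI(dims, 3);            // mark dimension 3 relevant, replaces BitSet.set(k)
        BitsUtil.setI(dims, 7);
        System.out.println(BitsUtil.cardinality(dims)); // 2, replaces BitSet.cardinality()
        long[] all = BitsUtil.ones(d);     // full dimension set, used for the noise cluster
        System.out.println(BitsUtil.cardinality(all));  // 10
      }
    }

The "I" suffix marks the in-place mutators; the long[] encoding avoids the object overhead of BitSet for the many short-lived dimension sets DOC allocates.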
@@ -336,9 +321,9 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param n Number of outer iterations (seed points).
* @return a cluster, if one is found, else <code>null</code>.
*/
- private Cluster<SubspaceModel<V>> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
+ private Cluster<SubspaceModel> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
// Relevant attributes of highest cardinality.
- BitSet D = null;
+ long[] D = null;
// The seed point for the best dimensions.
DBIDVar dV = DBIDUtil.newVar();
@@ -357,57 +342,46 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
// Initialize cluster info.
- BitSet nD = new BitSet(d);
+ long[] nD = BitsUtil.zero(d);
// Test each dimension.
for(int k = 0; k < d; ++k) {
if(dimensionIsRelevant(k, relation, randomSet)) {
- nD.set(k);
+ BitsUtil.setI(nD, k);
}
}
- if(D == null || nD.cardinality() > D.cardinality()) {
+ if(D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
D = nD;
dV.set(iter);
- if(D.cardinality() >= d_zero) {
+ if(BitsUtil.cardinality(D) >= d_zero) {
if(iprogress != null) {
iprogress.setProcessed(iprogress.getTotal(), LOG);
}
break outer;
}
}
-
- if(iprogress != null) {
- iprogress.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(iprogress);
}
}
-
- if(iprogress != null) {
- iprogress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(iprogress);
// If no relevant dimensions were found, skip it.
- if(D == null || D.cardinality() == 0) {
+ if(D == null || BitsUtil.cardinality(D) == 0) {
return null;
}
// Get all points in the box.
SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
- DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
- RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
+ DistanceQuery<V> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
// TODO: add filtering capabilities into query API!
- DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, new DoubleDistance(w)));
+ DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));
// If we have a non-empty cluster, return it.
- if(C.size() > 0) {
- return makeCluster(relation, C, D);
- }
- else {
- return null;
- }
+ return (C.size() > 0) ? makeCluster(relation, C, D) : null;
}
/**
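
The same hunks also replace the null-guarded progress updates with the null-safe convenience methods on Logging. A minimal sketch of that pattern (hypothetical class and loop; the Logging and FiniteProgress calls are those used above):

    import de.lmu.ifi.dbs.elki.logging.Logging;
    import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;

    public class ProgressDemo {
      private static final Logging LOG = Logging.getLogger(ProgressDemo.class);

      public static void main(String[] args) {
        final int n = 1000;
        // Allocate the progress object only in verbose mode, as in the hunks above.
        FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Items", n, LOG) : null;
        for(int i = 0; i < n; i++) {
          // ... per-item work ...
          LOG.incrementProcessed(prog); // null-safe: a no-op when prog == null
        }
        LOG.ensureCompleted(prog); // also null-safe
      }
    }

This removes the repeated if(iprogress != null) blocks that padded the old loop bodies.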
@@ -421,12 +395,11 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @return <code>true</code> if the dimension is relevant.
*/
private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
- double min = Double.POSITIVE_INFINITY;
- double max = Double.NEGATIVE_INFINITY;
+ double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
for(DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
- V xV = relation.get(iter);
- min = Math.min(min, xV.doubleValue(dimension));
- max = Math.max(max, xV.doubleValue(dimension));
+ double xV = relation.get(iter).doubleValue(dimension);
+ min = (xV < min) ? xV : min;
+ max = (xV > max) ? xV : max;
if(max - min > w) {
return false;
}
@@ -443,10 +416,10 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param D the relevant dimensions.
* @return an object representing the subspace cluster.
*/
- private Cluster<SubspaceModel<V>> makeCluster(Relation<V> relation, DBIDs C, BitSet D) {
+ private Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
- Cluster<SubspaceModel<V>> cluster = new Cluster<>(ids);
- cluster.setModel(new SubspaceModel<>(new Subspace(D), Centroid.make(relation, ids).toVector(relation)));
+ Cluster<SubspaceModel> cluster = new Cluster<>(ids);
+ cluster.setModel(new SubspaceModel(new Subspace(D), Centroid.make(relation, ids)));
return cluster;
}
@@ -483,7 +456,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Relative density threshold parameter Alpha.
*/
@@ -549,26 +522,26 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
super.makeOptions(config);
{
- DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2);
- param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
- param.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE) //
+ .addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
if(config.grab(param)) {
alpha = param.getValue();
}
}
{
- DoubleParameter param = new DoubleParameter(BETA_ID, 0.8);
- param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
- param.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ DoubleParameter param = new DoubleParameter(BETA_ID, 0.8) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE) //
+ .addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
if(config.grab(param)) {
beta = param.getValue();
}
}
{
- DoubleParameter param = new DoubleParameter(W_ID, 0.05);
- param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ DoubleParameter param = new DoubleParameter(W_ID, 0.05) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
if(config.grab(param)) {
w = param.getValue();
}
@@ -582,8 +555,8 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
if(heuristics) {
- IntParameter param = new IntParameter(D_ZERO_ID, 5);
- param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ IntParameter param = new IntParameter(D_ZERO_ID, 5) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(param)) {
d_zero = param.getValue();
}
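
The Parameterizer hunks above adopt a chained builder style: addConstraint returns the parameter itself, so constraints attach fluently before config.grab. A sketch with a hypothetical option ID and class; makeInstance is stubbed since only the makeOptions idiom matters here:

    import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
    import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
    import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
    import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
    import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;

    public class ExampleParameterizer extends AbstractParameterizer {
      /** Hypothetical option ID, for illustration only. */
      public static final OptionID EXAMPLE_ID = new OptionID("doc.example", "An example threshold in [0:1].");

      protected double example;

      @Override
      protected void makeOptions(Parameterization config) {
        super.makeOptions(config);
        // Constraints chain off the constructor; the trailing // keeps formatters from rejoining the lines.
        DoubleParameter param = new DoubleParameter(EXAMPLE_ID, 0.5) //
            .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE) //
            .addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
        if(config.grab(param)) {
          example = param.getValue();
        }
      }

      @Override
      protected Object makeInstance() {
        return null; // a real Parameterizer would construct the algorithm here
      }
    }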
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
index cd5e51b8..20849dd6 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,8 +23,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import gnu.trove.map.hash.TCustomHashMap;
+import gnu.trove.procedure.TObjectObjectProcedure;
+
import java.util.ArrayList;
-import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
@@ -34,7 +36,9 @@ import java.util.List;
import java.util.Map;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.ClusterOrderResult;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.CorrelationClusterOrderEntry;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.GeneralizedOPTICS;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -42,26 +46,20 @@ import de.lmu.ifi.dbs.elki.data.Subspace;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DiSHDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.AbstractDistance;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelationDistance;
import de.lmu.ifi.dbs.elki.index.preprocessed.preference.DiSHPreferenceVectorIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
import de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid;
-import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry;
-import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
-import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
@@ -73,7 +71,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@@ -93,7 +90,6 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
* @author Elke Achtert
*
* @apiviz.uses DiSHPreferenceVectorIndex
- * @apiviz.uses DiSHDistanceFunction
* @apiviz.has SubspaceModel
*
* @param <V> the type of NumberVector handled by this Algorithm
@@ -101,94 +97,61 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@Title("DiSH: Detecting Subspace cluster Hierarchies")
@Description("Algorithm to find hierarchical correlation clusters in subspaces.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15")
-public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+public class DiSH<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(DiSH.class);
/**
- * Parameter that specifies the maximum radius of the neighborhood to be
- * considered in each dimension for determination of the preference vector,
- * must be a double equal to or greater than 0.
- * <p>
- * Default value: {@code 0.001}
- * </p>
- * <p>
- * Key: {@code -dish.epsilon}
- * </p>
- */
- public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", "The maximum radius of the neighborhood " + "to be considered in each dimension for determination of " + "the preference vector.");
-
- /**
- * Parameter that specifies the a minimum number of points as a smoothing
- * factor to avoid the single-link-effect, must be an integer greater than 0.
- * <p>
- * Default value: {@code 1}
- * </p>
- * <p>
- * Key: {@code -dish.mu}
- * </p>
- */
- public static final OptionID MU_ID = new OptionID("dish.mu", "The minimum number of points as a smoothing factor to avoid the single-link-effekt.");
-
- /**
- * Holds the value of {@link #EPSILON_ID}.
+ * Holds the value of {@link Parameterizer#EPSILON_ID}.
*/
private double epsilon;
/**
- * The distance function we use
+ * The DiSH preprocessor.
*/
- private DiSHDistanceFunction dishDistance;
+ private DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor;
/**
- * Parameters that were given to OPTICS
+ * OPTICS minPts parameter.
*/
- private Collection<Pair<OptionID, Object>> opticsAlgorithmParameters;
+ private int mu;
/**
* Constructor.
*
* @param epsilon Epsilon value
- * @param dishDistance Distance function
- * @param opticsAlgorithmParameters OPTICS parameters
+ * @param mu Mu parameter (minPts)
+ * @param dishPreprocessor DiSH preprocessor
*/
- public DiSH(double epsilon, DiSHDistanceFunction dishDistance, Collection<Pair<OptionID, Object>> opticsAlgorithmParameters) {
+ public DiSH(double epsilon, int mu, DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor) {
super();
this.epsilon = epsilon;
- this.dishDistance = dishDistance;
- this.opticsAlgorithmParameters = opticsAlgorithmParameters;
+ this.mu = mu;
+ this.dishPreprocessor = dishPreprocessor;
}
/**
* Performs the DiSH algorithm on the given database.
*
- * @param database Database to process
* @param relation Relation to process
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
- // Instantiate DiSH distance (and thus run the preprocessor)
+ public Clustering<SubspaceModel> run(Relation<V> relation) {
if(LOG.isVerbose()) {
- LOG.verbose("*** Run DiSH preprocessor.");
+ LOG.verbose("Running the DiSH preprocessor.");
}
- DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation);
- // Configure and run OPTICS.
+ DiSHPreferenceVectorIndex<V> indexinst = dishPreprocessor.instantiate(relation);
if(LOG.isVerbose()) {
- LOG.verbose("*** Run OPTICS algorithm.");
+ LOG.verbose("Running the OPTICS algorithm.");
}
- ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters);
- opticsconfig.addParameter(OPTICS.DISTANCE_FUNCTION_ID, ProxyDistanceFunction.proxy(dishDistanceQuery));
- Class<OPTICS<V, PreferenceVectorBasedCorrelationDistance>> cls = ClassGenericsUtil.uglyCastIntoSubclass(OPTICS.class);
- OPTICS<V, PreferenceVectorBasedCorrelationDistance> optics = null;
- optics = opticsconfig.tryInstantiate(cls);
- ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation);
+ ClusterOrderResult<DiSHClusterOrderEntry> opticsResult = new DiSHOPTICS(indexinst).run(relation);
if(LOG.isVerbose()) {
- LOG.verbose("*** Compute Clusters.");
+ LOG.verbose("Compute Clusters.");
}
- return computeClusters(relation, opticsResult, dishDistanceQuery);
+ return computeClusters(relation, opticsResult);
}
/**
@@ -196,58 +159,68 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
*
* @param database the database holding the objects
* @param clusterOrder the cluster order
- * @param distFunc Distance function
*/
- private Clustering<SubspaceModel<V>> computeClusters(Relation<V> database, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder, DiSHDistanceFunction.Instance<V> distFunc) {
- int dimensionality = RelationUtil.dimensionality(database);
- int minpts = dishDistance.getMinpts();
+ private Clustering<SubspaceModel> computeClusters(Relation<V> database, ClusterOrderResult<DiSHClusterOrderEntry> clusterOrder) {
+ final int dimensionality = RelationUtil.dimensionality(database);
// extract clusters
- Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder);
+ TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = extractClusters(database, clusterOrder);
if(LOG.isVerbose()) {
- StringBuilder msg = new StringBuilder("Step 1: extract clusters");
- for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
- msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
+ final StringBuilder msg = new StringBuilder("Step 1: extract clusters\n");
+ clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() {
+ @Override
+ public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) {
+ msg.append(BitsUtil.toStringLow(key, dimensionality)).append(" sizes:");
+ for(ArrayModifiableDBIDs c : clusters) {
+ msg.append(' ').append(c.size());
+ }
+ msg.append('\n');
+ return true; // continue
}
- }
+ });
LOG.verbose(msg.toString());
}
// check if there are clusters < minpts
- checkClusters(database, distFunc, clustersMap, minpts);
+ checkClusters(database, clustersMap);
if(LOG.isVerbose()) {
- StringBuilder msg = new StringBuilder("Step 2: check clusters");
- for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
- msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
+ final StringBuilder msg = new StringBuilder("Step 2: check clusters\n");
+ clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() {
+ @Override
+ public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) {
+ msg.append(BitsUtil.toStringLow(key, dimensionality)).append(" sizes:");
+ for(ArrayModifiableDBIDs c : clusters) {
+ msg.append(' ').append(c.size());
+ }
+ msg.append('\n');
+ return true; // continue
}
- }
+ });
LOG.verbose(msg.toString());
}
// sort the clusters
- List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap);
+ List<Cluster<SubspaceModel>> clusters = sortClusters(database, clustersMap);
if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 3: sort clusters");
- for(Cluster<SubspaceModel<V>> c : clusters) {
- msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size());
+ for(Cluster<SubspaceModel> c : clusters) {
+ msg.append('\n').append(BitsUtil.toStringLow(c.getModel().getSubspace().getDimensions(), dimensionality)).append(" ids ").append(c.size());
}
LOG.verbose(msg.toString());
}
// build the hierarchy
- Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering");
- buildHierarchy(database, distFunc, clustering, clusters, dimensionality);
+ Clustering<SubspaceModel> clustering = new Clustering<>("DiSH clustering", "dish-clustering");
+ buildHierarchy(database, clustering, clusters, dimensionality);
if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 4: build hierarchy");
- for(Cluster<SubspaceModel<V>> c : clusters) {
- msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size());
- for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
+ for(Cluster<SubspaceModel> c : clusters) {
+ msg.append('\n').append(BitsUtil.toStringLow(c.getModel().getSubspace().getDimensions(), dimensionality)).append(" ids ").append(c.size());
+ for(Iter<Cluster<SubspaceModel>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
msg.append("\n parent ").append(iter.get());
}
- for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<SubspaceModel>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
msg.append("\n child ").append(iter.get());
}
}
@@ -255,7 +228,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
// build result
- for(Cluster<SubspaceModel<V>> c : clusters) {
+ for(Cluster<SubspaceModel> c : clusters) {
if(clustering.getClusterHierarchy().numParents(c) == 0) {
clustering.addToplevelCluster(c);
}
@@ -267,37 +240,38 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
* Extracts the clusters from the cluster order.
*
* @param database the database storing the objects
- * @param distFunc the distance function
* @param clusterOrder the cluster order to extract the clusters from
* @return the extracted clusters
*/
- private Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> extractClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder) {
+ private TCustomHashMap<long[], List<ArrayModifiableDBIDs>> extractClusters(Relation<V> database, ClusterOrderResult<DiSHClusterOrderEntry> clusterOrder) {
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extract Clusters", database.size(), LOG) : null;
int processed = 0;
- Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>();
- Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>();
- Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>();
- for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
- ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next();
+ TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY);
+ // Note clusterOrder currently contains DBID objects anyway.
+ Map<DBID, DiSHClusterOrderEntry> entryMap = new HashMap<>();
+ Map<DBID, Pair<long[], ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>();
+ for(Iterator<DiSHClusterOrderEntry> it = clusterOrder.iterator(); it.hasNext();) {
+ DiSHClusterOrderEntry entry = it.next();
entryMap.put(entry.getID(), entry);
V object = database.get(entry.getID());
- BitSet preferenceVector = entry.getReachability().getCommonPreferenceVector();
+ long[] preferenceVector = entry.getCommonPreferenceVector();
// get the list of (parallel) clusters for the preference vector
- List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector);
+ List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(preferenceVector);
if(parallelClusters == null) {
parallelClusters = new ArrayList<>();
clustersMap.put(preferenceVector, parallelClusters);
}
// look for the proper cluster
- Pair<BitSet, ArrayModifiableDBIDs> cluster = null;
- for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
- V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database);
- PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector);
- if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
- double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector());
+ ArrayModifiableDBIDs cluster = null;
+ for(ArrayModifiableDBIDs c : parallelClusters) {
+ Vector c_centroid = ProjectedCentroid.make(preferenceVector, database, c);
+ long[] commonPreferenceVector = BitsUtil.andCMin(preferenceVector, preferenceVector);
+ int subspaceDim = subspaceDimensionality(object, c_centroid, preferenceVector, preferenceVector, commonPreferenceVector);
+ if(subspaceDim == entry.getCorrelationValue()) {
+ double d = weightedDistance(object, c_centroid, commonPreferenceVector);
if(d <= 2 * epsilon) {
cluster = c;
break;
@@ -305,57 +279,56 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
if(cluster == null) {
- cluster = new Pair<>(preferenceVector, DBIDUtil.newArray());
+ cluster = DBIDUtil.newArray();
parallelClusters.add(cluster);
}
- cluster.second.add(entry.getID());
- entryToClusterMap.put(entry.getID(), cluster);
+ cluster.add(entry.getID());
+ entryToClusterMap.put(entry.getID(), new Pair<>(preferenceVector, cluster));
if(progress != null) {
progress.setProcessed(++processed, LOG);
}
}
- if(progress != null) {
- progress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(progress);
if(LOG.isDebuggingFiner()) {
+ int dim = RelationUtil.dimensionality(database);
StringBuilder msg = new StringBuilder("Step 0");
- for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
- msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size());
+ for(Map.Entry<long[], List<ArrayModifiableDBIDs>> clusterList : clustersMap.entrySet()) {
+ for(ArrayModifiableDBIDs c : clusterList.getValue()) {
+ msg.append('\n').append(BitsUtil.toStringLow(clusterList.getKey(), dim)).append(" ids ").append(c.size());
}
}
LOG.debugFiner(msg.toString());
}
// add the predecessor to the cluster
- for(BitSet pv : clustersMap.keySet()) {
- List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
- if(cluster.second.isEmpty()) {
+ for(long[] pv : clustersMap.keySet()) {
+ List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
+ for(ArrayModifiableDBIDs cluster : parallelClusters) {
+ if(cluster.isEmpty()) {
continue;
}
- DBID firstID = cluster.second.get(0);
- ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID);
+ DBID firstID = cluster.get(0);
+ DiSHClusterOrderEntry entry = entryMap.get(firstID);
DBID predecessorID = entry.getPredecessorID();
if(predecessorID == null) {
continue;
}
- ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID);
+ DiSHClusterOrderEntry predecessor = entryMap.get(predecessorID);
// parallel cluster
- if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
+ if(BitsUtil.equal(predecessor.getCommonPreferenceVector(), entry.getCommonPreferenceVector())) {
continue;
}
- if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
+ if(predecessor.compareTo(entry) < 0) {
continue;
}
- Pair<BitSet, ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID);
+ Pair<long[], ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID);
oldCluster.second.remove(predecessorID);
- cluster.second.add(predecessorID);
+ cluster.add(predecessorID);
entryToClusterMap.remove(predecessorID);
- entryToClusterMap.put(predecessorID, cluster);
+ entryToClusterMap.put(predecessorID, new Pair<>(pv, cluster));
}
}
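
With long[] preference vectors as map keys, a plain HashMap would hash by array identity; hence the TCustomHashMap with BitsUtil.TROVE_HASH_STRATEGY above. A minimal sketch of the map idiom (hypothetical demo class; the Trove and BitsUtil calls are those used in the hunks):

    import java.util.ArrayList;
    import java.util.List;

    import gnu.trove.map.hash.TCustomHashMap;
    import gnu.trove.procedure.TObjectObjectProcedure;

    import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
    import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
    import de.lmu.ifi.dbs.elki.utilities.BitsUtil;

    public class PreferenceVectorMapDemo {
      public static void main(String[] args) {
        // Content-based hashing for long[] keys instead of identity hashing.
        TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY);

        long[] pv = BitsUtil.zero(4); // preference vector over 4 dimensions
        BitsUtil.setI(pv, 1);
        List<ArrayModifiableDBIDs> parallelClusters = new ArrayList<>();
        parallelClusters.add(DBIDUtil.newArray());
        clustersMap.put(pv, parallelClusters);

        // Iterate entries without allocating an entry set, as in the verbose logging above.
        clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() {
          @Override
          public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) {
            System.out.println(BitsUtil.toStringLow(key, 4) + " clusters: " + clusters.size());
            return true; // continue iteration
          }
        });
      }
    }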
@@ -366,21 +339,21 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
* Returns a sorted list of the clusters w.r.t. the subspace dimensionality in
* descending order.
*
- * @param database the database storing the objects
+ * @param relation the database storing the objects
* @param clustersMap the mapping of bits sets to clusters
* @return a sorted list of the clusters
*/
- private List<Cluster<SubspaceModel<V>>> sortClusters(Relation<V> database, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) {
- final int db_dim = RelationUtil.dimensionality(database);
+ private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
+ final int db_dim = RelationUtil.dimensionality(relation);
// int num = 1;
- List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>();
- for(BitSet pv : clustersMap.keySet()) {
- List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
+ List<Cluster<SubspaceModel>> clusters = new ArrayList<>();
+ for(long[] pv : clustersMap.keySet()) {
+ List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
for(int i = 0; i < parallelClusters.size(); i++) {
- Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i);
- Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second);
- cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database)));
- String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, "");
+ ArrayModifiableDBIDs c = parallelClusters.get(i);
+ Cluster<SubspaceModel> cluster = new Cluster<>(c);
+ cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c)));
+ String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim);
if(parallelClusters.size() > 1) {
cluster.setName("Cluster_" + subspace + "_" + i);
}
@@ -391,12 +364,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
// sort the clusters w.r.t. lambda
- Comparator<Cluster<SubspaceModel<V>>> comparator = new Comparator<Cluster<SubspaceModel<V>>>() {
+ Comparator<Cluster<SubspaceModel>> comparator = new Comparator<Cluster<SubspaceModel>>() {
@Override
- public int compare(Cluster<SubspaceModel<V>> c1, Cluster<SubspaceModel<V>> c2) {
+ public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) {
return c2.getModel().getSubspace().dimensionality() - c1.getModel().getSubspace().dimensionality();
}
-
};
Collections.sort(clusters, comparator);
return clusters;
@@ -406,32 +378,31 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
* Removes the clusters with size < minpts from the cluster map and adds them
* to their parents.
*
- * @param database the database storing the objects
- * @param distFunc the distance function
+ * @param relation the relation storing the objects
* @param clustersMap the map containing the clusters
- * @param minpts MinPts
*/
- private void checkClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap, int minpts) {
+ private void checkClusters(Relation<V> relation, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
+ final int dimensionality = RelationUtil.dimensionality(relation);
// check if there are clusters < minpts
// and add them to not assigned
- List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>();
- Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>();
- Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray());
- for(BitSet pv : clustersMap.keySet()) {
+ List<Pair<long[], ArrayModifiableDBIDs>> notAssigned = new ArrayList<>();
+ TCustomHashMap<long[], List<ArrayModifiableDBIDs>> newClustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY);
+ Pair<long[], ArrayModifiableDBIDs> noise = new Pair<>(BitsUtil.zero(dimensionality), DBIDUtil.newArray());
+ for(long[] pv : clustersMap.keySet()) {
// noise
- if(pv.cardinality() == 0) {
- List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
- noise.second.addDBIDs(c.second);
+ if(BitsUtil.cardinality(pv) == 0) {
+ List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
+ for(ArrayModifiableDBIDs c : parallelClusters) {
+ noise.second.addDBIDs(c);
}
}
// clusters
else {
- List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size());
- for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
- if(!pv.equals(new BitSet()) && c.second.size() < minpts) {
- notAssigned.add(c);
+ List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
+ List<ArrayModifiableDBIDs> newParallelClusters = new ArrayList<>(parallelClusters.size());
+ for(ArrayModifiableDBIDs c : parallelClusters) {
+ if(!BitsUtil.isZero(pv) && c.size() < mu) {
+ notAssigned.add(new Pair<>(pv, c));
}
else {
newParallelClusters.add(c);
@@ -444,11 +415,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
clustersMap.clear();
clustersMap.putAll(newClustersMap);
- for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
+ for(Pair<long[], ArrayModifiableDBIDs> c : notAssigned) {
if(c.second.isEmpty()) {
continue;
}
- Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap);
+ Pair<long[], ArrayModifiableDBIDs> parent = findParent(relation, c, clustersMap);
if(parent != null) {
parent.second.addDBIDs(c.second);
}
@@ -457,30 +428,29 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
- List<Pair<BitSet, ArrayModifiableDBIDs>> noiseList = new ArrayList<>(1);
- noiseList.add(noise);
+ List<ArrayModifiableDBIDs> noiseList = new ArrayList<>(1);
+ noiseList.add(noise.second);
clustersMap.put(noise.first, noiseList);
}
/**
* Returns the parent of the specified cluster
*
- * @param database the database storing the objects
- * @param distFunc the distance function
+ * @param relation the relation storing the objects
* @param child the child to search the parent for
* @param clustersMap the map containing the clusters
* @return the parent of the specified cluster
*/
- private Pair<BitSet, ArrayModifiableDBIDs> findParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Pair<BitSet, ArrayModifiableDBIDs> child, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) {
- V child_centroid = ProjectedCentroid.make(child.first, database, child.second).toVector(database);
+ private Pair<long[], ArrayModifiableDBIDs> findParent(Relation<V> relation, Pair<long[], ArrayModifiableDBIDs> child, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
+ Vector child_centroid = ProjectedCentroid.make(child.first, relation, child.second);
- Pair<BitSet, ArrayModifiableDBIDs> result = null;
+ Pair<long[], ArrayModifiableDBIDs> result = null;
int resultCardinality = -1;
- BitSet childPV = child.first;
- int childCardinality = childPV.cardinality();
- for(BitSet parentPV : clustersMap.keySet()) {
- int parentCardinality = parentPV.cardinality();
+ long[] childPV = child.first;
+ int childCardinality = BitsUtil.cardinality(childPV);
+ for(long[] parentPV : clustersMap.keySet()) {
+ int parentCardinality = BitsUtil.cardinality(parentPV);
if(parentCardinality >= childCardinality) {
continue;
}
@@ -488,15 +458,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
continue;
}
- BitSet pv = (BitSet) childPV.clone();
- pv.and(parentPV);
+ long[] pv = BitsUtil.andCMin(childPV, parentPV);
if(pv.equals(parentPV)) {
- List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV);
- for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
- V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database);
- double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV);
+ List<ArrayModifiableDBIDs> parentList = clustersMap.get(parentPV);
+ for(ArrayModifiableDBIDs parent : parentList) {
+ Vector parent_centroid = ProjectedCentroid.make(parentPV, relation, parent);
+ double d = weightedDistance(child_centroid, parent_centroid, parentPV);
if(d <= 2 * epsilon) {
- result = parent;
+ result = new Pair<>(parentPV, parent);
resultCardinality = parentCardinality;
break;
}
@@ -510,65 +479,70 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
/**
* Builds the cluster hierarchy.
*
- * @param distFunc the distance function
* @param clustering Clustering we process
* @param clusters the sorted list of clusters
* @param dimensionality the dimensionality of the data
* @param database the database containing the data objects
*/
- private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Clustering<SubspaceModel<V>> clustering, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) {
- StringBuilder msg = new StringBuilder();
+ private void buildHierarchy(Relation<V> database, Clustering<SubspaceModel> clustering, List<Cluster<SubspaceModel>> clusters, int dimensionality) {
+ StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null;
final int db_dim = RelationUtil.dimensionality(database);
- Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy();
+ Hierarchy<Cluster<SubspaceModel>> hier = clustering.getClusterHierarchy();
for(int i = 0; i < clusters.size() - 1; i++) {
- Cluster<SubspaceModel<V>> c_i = clusters.get(i);
- int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality();
- V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database);
+ Cluster<SubspaceModel> c_i = clusters.get(i);
+ final Subspace s_i = c_i.getModel().getSubspace();
+ int subspaceDim_i = dimensionality - s_i.dimensionality();
+ Vector ci_centroid = ProjectedCentroid.make(s_i.getDimensions(), database, c_i.getIDs());
+ long[] pv1 = s_i.getDimensions();
for(int j = i + 1; j < clusters.size(); j++) {
- Cluster<SubspaceModel<V>> c_j = clusters.get(j);
- int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality();
+ Cluster<SubspaceModel> c_j = clusters.get(j);
+ final Subspace s_j = c_j.getModel().getSubspace();
+ int subspaceDim_j = dimensionality - s_j.dimensionality();
if(subspaceDim_i < subspaceDim_j) {
- if(LOG.isDebugging()) {
- msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']');
- msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']');
+ if(msg != null) {
+ msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)).append(']');
+ msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)).append(']');
}
// noise level reached
- if(c_j.getModel().getSubspace().dimensionality() == 0) {
+ if(s_j.dimensionality() == 0) {
// no parents exists -> parent is noise
if(hier.numParents(c_i) == 0) {
clustering.addChildCluster(c_j, c_i);
- if(LOG.isDebugging()) {
- msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
- msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
+ if(msg != null) {
+ msg.append("\n [").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim));
+ msg.append("] is parent of [").append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim));
msg.append(']');
}
}
}
else {
- V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database);
- PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions());
- double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector());
- if(LOG.isDebugging()) {
- msg.append("\n dist = ").append(distance.getCorrelationValue());
+ Vector cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs());
+ long[] pv2 = s_j.getDimensions();
+ long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2);
+ int subspaceDim = subspaceDimensionality(ci_centroid, cj_centroid, pv1, pv2, commonPreferenceVector);
+
+ double d = weightedDistance(ci_centroid, cj_centroid, commonPreferenceVector);
+ if(msg != null) {
+ msg.append("\n dist = ").append(subspaceDim);
}
- if(distance.getCorrelationValue() == subspaceDim_j) {
- if(LOG.isDebugging()) {
+ if(subspaceDim == subspaceDim_j) {
+ if(msg != null) {
msg.append("\n d = ").append(d);
}
if(d <= 2 * epsilon) {
// no parent exists or c_j is not a parent of the already
// existing parents
- if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
+ if(hier.numParents(c_i) == 0 || !isParent(database, c_j, hier.iterParents(c_i), db_dim)) {
clustering.addChildCluster(c_j, c_i);
- if(LOG.isDebugging()) {
- msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
+ if(msg != null) {
+ msg.append("\n [").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim));
msg.append("] is parent of [");
- msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
+ msg.append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim));
msg.append(']');
}
}
@@ -581,7 +555,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
}
- if(LOG.isDebugging()) {
+ if(msg != null) {
LOG.debug(msg.toString());
}
}
@@ -590,31 +564,75 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
   * Returns true if the specified parent cluster is a parent of at least one
   * of the given child clusters.
*
- * @param database the database containing the objects
- * @param distFunc the distance function for distance computation between the
- * clusters
+   * @param relation the relation containing the objects
* @param parent the parent to be tested
* @param iter the list of children to be tested
+ * @param db_dim Database dimensionality
   * @return true if the specified parent cluster is a parent of at least one
   *         of the given child clusters, false otherwise
*/
- private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, Iter<Cluster<SubspaceModel<V>>> iter) {
- V parent_centroid = ProjectedCentroid.make(parent.getModel().getDimensions(), database, parent.getIDs()).toVector(database);
- int dimensionality = RelationUtil.dimensionality(database);
- int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality();
+ private boolean isParent(Relation<V> relation, Cluster<SubspaceModel> parent, Iter<Cluster<SubspaceModel>> iter, int db_dim) {
+ Subspace s_p = parent.getModel().getSubspace();
+ Vector parent_centroid = ProjectedCentroid.make(s_p.getDimensions(), relation, parent.getIDs());
+ int subspaceDim_parent = db_dim - s_p.dimensionality();
for(; iter.valid(); iter.advance()) {
- Cluster<SubspaceModel<V>> child = iter.get();
- V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database);
- PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions());
- if(distance.getCorrelationValue() == subspaceDim_parent) {
+ Cluster<SubspaceModel> child = iter.get();
+ Subspace s_c = child.getModel().getSubspace();
+ Vector child_centroid = ProjectedCentroid.make(s_c.getDimensions(), relation, child.getIDs());
+ long[] commonPreferenceVector = BitsUtil.andCMin(s_p.getDimensions(), s_c.getDimensions());
+ int subspaceDim = subspaceDimensionality(parent_centroid, child_centroid, s_p.getDimensions(), s_c.getDimensions(), commonPreferenceVector);
+ if(subspaceDim == subspaceDim_parent) {
return true;
}
}
-
return false;
}
+ /**
+ * Compute the common subspace dimensionality of two vectors.
+ *
+ * @param v1 First vector
+ * @param v2 Second vector
+   * @param pv1 First preference vector
+   * @param pv2 Second preference vector
+   * @param commonPreferenceVector Common preference vector
+   * @return Usually v1.dim - commonPreferenceVector.cardinality; one more if
+   *         one of pv1 and pv2 is a subset of the other, but the two vectors
+   *         are more than 2 * epsilon apart.
+ */
+ private int subspaceDimensionality(NumberVector v1, NumberVector v2, long[] pv1, long[] pv2, long[] commonPreferenceVector) {
+ // number of zero values in commonPreferenceVector
+ int subspaceDim = v1.getDimensionality() - BitsUtil.cardinality(commonPreferenceVector);
+
+ // special case: v1 and v2 are in parallel subspaces
+ if(BitsUtil.equal(commonPreferenceVector, pv1) || BitsUtil.equal(commonPreferenceVector, pv2)) {
+ double d = weightedDistance(v1, v2, commonPreferenceVector);
+ if(d > 2 * epsilon) {
+ subspaceDim++;
+ }
+ }
+ return subspaceDim;
+ }
+
+ /**
+ * Computes the weighted distance between the two specified vectors according
+ * to the given preference vector.
+ *
+ * @param v1 the first vector
+ * @param v2 the second vector
+ * @param weightVector the preference vector
+ * @return the weighted distance between the two specified vectors according
+ * to the given preference vector
+ */
+ protected static double weightedDistance(NumberVector v1, NumberVector v2, long[] weightVector) {
+ double sqrDist = 0;
+ for(int i = BitsUtil.nextSetBit(weightVector, 0); i >= 0; i = BitsUtil.nextSetBit(weightVector, i + 1)) {
+      final double delta = v1.doubleValue(i) - v2.doubleValue(i);
+      sqrDist += delta * delta;
+ }
+ return Math.sqrt(sqrDist);
+ }
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
@@ -626,20 +644,199 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
/**
+ * OPTICS variant used by DiSH internally.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public class DiSHOPTICS extends GeneralizedOPTICS<V, DiSHClusterOrderEntry> {
+ /**
+ * DiSH preprocessor instance.
+ */
+ private DiSHPreferenceVectorIndex<V> index;
+
+ /**
+ * Constructor.
+ *
+ * @param indexinst Preprocessor instance.
+ */
+ public DiSHOPTICS(DiSHPreferenceVectorIndex<V> indexinst) {
+ super(mu);
+ this.index = indexinst;
+ }
+
+ @Override
+ public Class<? super DiSHClusterOrderEntry> getEntryType() {
+ return DiSHClusterOrderEntry.class;
+ }
+
+ @Override
+ protected DiSHClusterOrderEntry makeSeedEntry(Relation<V> relation, DBID objectID) {
+ return new DiSHClusterOrderEntry(objectID, null, Integer.MAX_VALUE, Double.POSITIVE_INFINITY, new long[0]);
+ }
+
+ @Override
+ protected Collection<DiSHClusterOrderEntry> getNeighborsForDBID(Relation<V> relation, DBID id) {
+ DBID id1 = DBIDUtil.deref(id);
+ long[] pv1 = index.getPreferenceVector(id1);
+ V dv1 = relation.get(id1);
+ final int dim = dv1.getDimensionality();
+
+ long[] ones = BitsUtil.ones(dim);
+ long[] inverseCommonPreferenceVector = BitsUtil.ones(dim);
+
+ ArrayList<DiSHClusterOrderEntry> result = new ArrayList<>();
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ long[] pv2 = index.getPreferenceVector(iter);
+ V dv2 = relation.get(iter);
+ // We need a copy of this for the distance.
+ long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2);
+
+ // number of zero values in commonPreferenceVector
+ int subspaceDim = dim - BitsUtil.cardinality(commonPreferenceVector);
+
+ // special case: v1 and v2 are in parallel subspaces
+ if(BitsUtil.equal(commonPreferenceVector, pv1) || BitsUtil.equal(commonPreferenceVector, pv2)) {
+ double d = weightedDistance(dv1, dv2, commonPreferenceVector);
+ if(d > 2 * epsilon) {
+ subspaceDim++;
+ }
+ }
+
+ // flip commonPreferenceVector for distance computation in common
+ // subspace
+ System.arraycopy(ones, 0, inverseCommonPreferenceVector, 0, ones.length);
+ BitsUtil.xorI(inverseCommonPreferenceVector, commonPreferenceVector);
+
+ final double orthogonalDistance = weightedDistance(dv1, dv2, inverseCommonPreferenceVector);
+ result.add(new DiSHClusterOrderEntry(DBIDUtil.deref(iter), id1, subspaceDim, orthogonalDistance, commonPreferenceVector));
+ }
+ Collections.sort(result);
+      // Hack to enforce the OPTICS core-distance: the first minPts-1
+      // neighbors must not appear closer than the core-distance itself.
+ if(result.size() >= getMinPts()) {
+ DiSHClusterOrderEntry coredist = result.get(getMinPts() - 1);
+ for(int i = 0; i < getMinPts() - 1; i++) {
+ final DiSHClusterOrderEntry prev = result.get(i);
+ result.set(i, new DiSHClusterOrderEntry(prev.getID(), id1, coredist.getCorrelationValue(), coredist.getEuclideanValue(), coredist.commonPreferenceVector));
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.DOUBLE_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+ }
+
+ /**
+ * Cluster order entry for DiSH.
+ *
+ * @author Elke Achtert
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class DiSHClusterOrderEntry extends CorrelationClusterOrderEntry<DiSHClusterOrderEntry> {
+ /**
+ * The common preference vector of the two objects defining this distance.
+ */
+ private long[] commonPreferenceVector;
+
+ /**
+     * Constructs a new cluster order entry.
+     * 
+     * @param objectID the ID of the entry's object
+     * @param predecessorID the ID of the predecessor in the cluster order
+     * @param correlationValue the correlation dimension represented by this
+     *        entry
+     * @param euclideanValue the Euclidean distance represented by this entry
+     * @param commonPreferenceVector the common preference vector of the two
+     *        objects defining this distance
+ */
+ public DiSHClusterOrderEntry(DBID objectID, DBID predecessorID, int correlationValue, double euclideanValue, long[] commonPreferenceVector) {
+ super(objectID, predecessorID, correlationValue, euclideanValue);
+ this.commonPreferenceVector = commonPreferenceVector;
+ }
+
+ /**
+ * Returns the common preference vector of the two objects defining this
+ * distance.
+ *
+ * @return the common preference vector
+ */
+ public long[] getCommonPreferenceVector() {
+ return commonPreferenceVector;
+ }
+
+ /**
+     * Returns a string representation of this cluster order entry.
+ *
+ * @return the correlation value, the Euclidean value and the common
+ * preference vector separated by blanks
+ */
+ @Override
+ public String toString() {
+      // Format the bit vector; long[].toString() would only print a reference.
+      return super.toString() + SEPARATOR + BitsUtil.toString(commonPreferenceVector);
+ }
+
+ @Override
+ public int compareTo(DiSHClusterOrderEntry other) {
+ return super.compareTo(other);
+ }
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter that specifies the maximum radius of the neighborhood to be
+ * considered in each dimension for determination of the preference vector,
+ * must be a double equal to or greater than 0.
+ * <p>
+ * Default value: {@code 0.001}
+ * </p>
+ * <p>
+ * Key: {@code -dish.epsilon}
+ * </p>
+ */
+ public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", //
+ "The maximum radius of the neighborhood to be considered in each " //
+ + " dimension for determination of the preference vector.");
+
+ /**
+     * Parameter that specifies a minimum number of points as a smoothing
+     * factor to avoid the single-link effect, must be an integer greater than
+ * 0.
+ * <p>
+ * Default value: {@code 1}
+ * </p>
+ * <p>
+ * Key: {@code -dish.mu}
+ * </p>
+ */
+ public static final OptionID MU_ID = new OptionID("dish.mu", //
+ "The minimum number of points as a smoothing factor to avoid the single-link-effekt.");
+
protected double epsilon = 0.0;
protected int mu = 1;
- protected DiSHDistanceFunction dishDistance;
-
- protected Collection<Pair<OptionID, Object>> opticsO;
+ /**
+ * DiSH preprocessor.
+ */
+ protected DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor;
@Override
protected void makeOptions(Parameterization config) {
@@ -657,53 +854,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
mu = muP.intValue();
}
- configDiSHDistance(config, epsilon, mu);
-
- configOPTICS(config, mu, dishDistance);
+ configDiSHPreprocessor(config, epsilon, mu);
}
- public void configDiSHDistance(Parameterization config, double epsilon, int minpts) {
+ public void configDiSHPreprocessor(Parameterization config, double epsilon, int minpts) {
ListParameterization dishParameters = new ListParameterization();
- dishParameters.addParameter(DiSHDistanceFunction.EPSILON_ID, epsilon);
- dishParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, DiSHPreferenceVectorIndex.Factory.class);
- dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, Double.toString(epsilon));
+ dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, epsilon);
dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.MINPTS_ID, minpts);
ChainedParameterization dishchain = new ChainedParameterization(dishParameters, config);
dishchain.errorsTo(config);
- dishDistance = dishchain.tryInstantiate(DiSHDistanceFunction.class);
- }
-
- /**
- * Get the parameters for embedded OPTICS.
- *
- * @param config Parameterization
- * @param minpts MinPts value
- * @param dishDistance DiSH distance function
- */
- public void configOPTICS(Parameterization config, final int minpts, final DiSHDistanceFunction dishDistance) {
- // Configure OPTICS. Tracked parameters
- ListParameterization opticsParameters = new ListParameterization();
- opticsParameters.addParameter(OPTICS.EPSILON_ID, AbstractDistance.INFINITY_PATTERN);
- opticsParameters.addParameter(OPTICS.MINPTS_ID, minpts);
- // Configure OPTICS. Untracked parameters
- ListParameterization opticsUntrackedParameters = new ListParameterization();
- opticsUntrackedParameters.addParameter(OPTICS.DISTANCE_FUNCTION_ID, dishDistance);
- ChainedParameterization optchain = new ChainedParameterization(opticsParameters, config);
- TrackParameters trackpar = new TrackParameters(optchain);
-
- ChainedParameterization optchain2 = new ChainedParameterization(opticsUntrackedParameters, trackpar);
- optchain2.errorsTo(config);
-
- // Instantiate OPTICS for parameterization
- optchain2.tryInstantiate(OPTICS.class);
- // store parameters
- opticsO = trackpar.getGivenParameters();
+ final Class<DiSHPreferenceVectorIndex.Factory<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DiSHPreferenceVectorIndex.Factory.class);
+ dishPreprocessor = dishchain.tryInstantiate(cls);
}
@Override
protected DiSH<V> makeInstance() {
- return new DiSH<>(epsilon, dishDistance, opticsO);
+ return new DiSH<>(epsilon, mu, dishPreprocessor);
}
}
}
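
With DiSHDistanceFunction gone, DiSH now computes its correlation distance inline: intersect the two preference vectors, count the dimensions lost, and add one when the vectors lie in parallel subspaces more than 2 * epsilon apart. Below is a minimal, self-contained sketch of that arithmetic for up to 64 dimensions, using a plain long bitmask instead of ELKI's long[]-based BitsUtil; all names are illustrative, not ELKI API.

    // Sketch of the DiSH correlation distance, for dimensionalities <= 64.
    public class DishDistanceSketch {
      // Euclidean distance restricted to the dimensions set in the bitmask.
      static double weightedDistance(double[] v1, double[] v2, long weights) {
        double sqrDist = 0;
        for(int i = 0; i < v1.length; i++) {
          if((weights >>> i & 1L) != 0L) {
            double delta = v1[i] - v2[i];
            sqrDist += delta * delta;
          }
        }
        return Math.sqrt(sqrDist);
      }

      // Correlation value: dimensions not shared by both preference vectors,
      // plus one if the subspaces are parallel but too far apart.
      static int subspaceDimensionality(double[] v1, double[] v2, long pv1, long pv2, double epsilon) {
        long common = pv1 & pv2; // analogue of BitsUtil.andCMin
        int subspaceDim = v1.length - Long.bitCount(common);
        if(common == pv1 || common == pv2) { // one preference contains the other
          if(weightedDistance(v1, v2, common) > 2 * epsilon) {
            subspaceDim++; // parallel subspaces, but distinct clusters
          }
        }
        return subspaceDim;
      }

      public static void main(String[] args) {
        double[] a = { 0.0, 0.5, 0.1 }, b = { 1.0, 0.5, 0.9 };
        long pv1 = 0b010, pv2 = 0b010; // both prefer dimension 1 only
        System.out.println(weightedDistance(a, b, pv1 & pv2)); // 0.0: they agree on dim 1
        System.out.println(subspaceDimensionality(a, b, pv1, pv2, 0.001)); // 3 - 1 = 2
      }
    }

buildHierarchy() above attaches c_i below c_j exactly when this value matches c_j's subspace level and the weighted centroid distance stays within 2 * epsilon.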
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
index 3f135564..b0ad5eff 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
@@ -1,39 +1,53 @@
package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
-/*
-This file is part of ELKI:
-Environment for Developing KDD-Applications Supported by Index-Structures
-
-Copyright (C) 2013
-Ludwig-Maximilians-Universität München
-Lehr- und Forschungseinheit für Datenbanksysteme
-ELKI Development Team
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.ClusterOrderResult;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.CorrelationClusterOrderEntry;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.GeneralizedOPTICS;
import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.HiSCDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelationDistance;
+import de.lmu.ifi.dbs.elki.index.IndexFactory;
import de.lmu.ifi.dbs.elki.index.preprocessed.preference.HiSCPreferenceVectorIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
@@ -53,26 +67,135 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
* @author Elke Achtert
*
* @apiviz.uses HiSCPreferenceVectorIndex
- * @apiviz.uses HiSCDistanceFunction
*
* @param <V> the type of NumberVector handled by the algorithm
*/
@Title("Finding Hierarchies of Subspace Clusters")
@Description("Algorithm for detecting hierarchies of subspace clusters.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Finding Hierarchies of Subspace Clusters", booktitle = "Proc. 10th Europ. Conf. on Principles and Practice of Knowledge Discovery in Databases (PKDD'06), Berlin, Germany, 2006", url = "http://www.dbs.ifi.lmu.de/Publikationen/Papers/PKDD06-HiSC.pdf")
-public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorBasedCorrelationDistance> {
+public class HiSC<V extends NumberVector> extends GeneralizedOPTICS<V, HiSC.HiSCClusterOrderEntry> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(HiSC.class);
/**
+ * Factory to produce
+ */
+ private IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>> indexfactory;
+
+ /**
+ * Instantiated index.
+ */
+ private HiSCPreferenceVectorIndex<NumberVector> index;
+
+ /**
+ * Relation we are currently processing.
+ */
+ private Relation<V> relation;
+
+ /**
+   * Holds the maximum allowed deviation (alpha).
+ */
+ private double alpha;
+
+ /**
* Constructor.
- *
- * @param distanceFunction HiSC distance function used
+ *
+   * @param indexfactory HiSC index factory
+   * @param epsilon the maximum allowed deviation (alpha)
+   */
+ public HiSC(IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>> indexfactory, double epsilon) {
+ super(2);
+ this.indexfactory = indexfactory;
+ this.alpha = epsilon;
+ }
+
+ @Override
+ public ClusterOrderResult<HiSCClusterOrderEntry> run(Relation<V> relation) {
+ assert (this.index == null && this.relation == null) : "Running algorithm instance multiple times in parallel is not supported.";
+ this.index = indexfactory.instantiate(relation);
+ this.relation = relation;
+ ClusterOrderResult<HiSCClusterOrderEntry> result = super.run(relation);
+ this.index = null;
+ this.relation = null;
+ return result;
+ }
+
+ @Override
+ protected HiSCClusterOrderEntry makeSeedEntry(Relation<V> relation, DBID objectID) {
+ return new HiSCClusterOrderEntry(objectID, null, Integer.MAX_VALUE, Double.POSITIVE_INFINITY, new long[0]);
+ }
+
+ @Override
+ protected Collection<HiSCClusterOrderEntry> getNeighborsForDBID(Relation<V> relation, DBID id) {
+ DBID id1 = DBIDUtil.deref(id);
+ long[] pv1 = index.getPreferenceVector(id1);
+ V v1 = relation.get(id1);
+ final int dim = v1.getDimensionality();
+
+ ArrayList<HiSCClusterOrderEntry> result = new ArrayList<>();
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ long[] pv2 = index.getPreferenceVector(iter);
+ V v2 = relation.get(iter);
+ final long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2);
+
+ // number of zero values in commonPreferenceVector
+ int subspaceDim = dim - BitsUtil.cardinality(commonPreferenceVector);
+
+ // special case: v1 and v2 are in parallel subspaces
+ double dist1 = weightedDistance(v1, v2, pv1);
+ double dist2 = weightedDistance(v1, v2, pv2);
+
+ if(Math.max(dist1, dist2) > alpha) {
+ subspaceDim++;
+ if(LOG.isDebugging()) {
+ StringBuilder msg = new StringBuilder();
+ msg.append("\ndist1 ").append(dist1);
+ msg.append("\ndist2 ").append(dist2);
+ msg.append("\nsubspaceDim ").append(subspaceDim);
+ msg.append("\ncommon pv ").append(BitsUtil.toStringLow(commonPreferenceVector, dim));
+ LOG.debugFine(msg.toString());
+ }
+ }
+
+ // flip commonPreferenceVector for distance computation in common subspace
+ long[] inverseCommonPreferenceVector = BitsUtil.ones(dim);
+ BitsUtil.xorI(inverseCommonPreferenceVector, commonPreferenceVector);
+
+ final double orthogonalDistance = weightedDistance(v1, v2, inverseCommonPreferenceVector);
+ result.add(new HiSCClusterOrderEntry(DBIDUtil.deref(iter), id1, subspaceDim, orthogonalDistance, commonPreferenceVector));
+ }
+ Collections.sort(result);
+ return result;
+ }
+
+ /**
+ * Computes the weighted distance between the two specified vectors according
+ * to the given preference vector.
+ *
+ * @param v1 the first vector
+ * @param v2 the second vector
+ * @param weightVector the preference vector
+ * @return the weighted distance between the two specified vectors according
+ * to the given preference vector
*/
- public HiSC(HiSCDistanceFunction<V> distanceFunction) {
- super(distanceFunction, distanceFunction.getDistanceFactory().infiniteDistance(), 2);
+ public double weightedDistance(V v1, V v2, long[] weightVector) {
+ double sqrDist = 0.;
+ for(int i = BitsUtil.nextSetBit(weightVector, 0); i >= 0; i = BitsUtil.nextSetBit(weightVector, i + 1)) {
+      final double delta = v1.doubleValue(i) - v2.doubleValue(i);
+      sqrDist += delta * delta;
+ }
+ return Math.sqrt(sqrDist);
+ }
+
+ @Override
+ public Class<? super HiSCClusterOrderEntry> getEntryType() {
+ return HiSCClusterOrderEntry.class;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(indexfactory.getInputTypeRestriction());
}
@Override
@@ -81,22 +204,99 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB
}
/**
+ * Cluster order entry for HiSC.
+ *
+ * @author Elke Achtert
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class HiSCClusterOrderEntry extends CorrelationClusterOrderEntry<HiSCClusterOrderEntry> {
+ /**
+ * The common preference vector of the two objects defining this distance.
+ */
+ private long[] commonPreferenceVector;
+
+ /**
+     * Constructs a new cluster order entry.
+     * 
+     * @param objectID the ID of the entry's object
+     * @param predecessorID the ID of the predecessor in the cluster order
+     * @param correlationValue the correlation dimension represented by this
+     *        entry
+     * @param euclideanValue the Euclidean distance represented by this entry
+     * @param commonPreferenceVector the common preference vector of the two
+     *        objects defining this distance
+ */
+ public HiSCClusterOrderEntry(DBID objectID, DBID predecessorID, int correlationValue, double euclideanValue, long[] commonPreferenceVector) {
+ super(objectID, predecessorID, correlationValue, euclideanValue);
+ this.commonPreferenceVector = commonPreferenceVector;
+ }
+
+ /**
+ * Returns the common preference vector of the two objects defining this
+ * distance.
+ *
+ * @return the common preference vector
+ */
+ public long[] getCommonPreferenceVector() {
+ return commonPreferenceVector;
+ }
+
+ /**
+     * Returns a string representation of this cluster order entry.
+ *
+ * @return the correlation value, the Euclidean value and the common
+ * preference vector separated by blanks
+ */
+ @Override
+ public String toString() {
+      // Format the bit vector; long[].toString() would only print a reference.
+      return super.toString() + SEPARATOR + BitsUtil.toString(commonPreferenceVector);
+ }
+
+ @Override
+ public int compareTo(HiSCClusterOrderEntry other) {
+ return super.compareTo(other);
+ }
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
- HiSCDistanceFunction<V> distanceFunction;
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the maximum distance between two vectors with equal
+ * preference vectors before considering them as parallel, must be a double
+ * equal to or greater than 0.
+ * <p>
+ * Default value: {@code 0.001}
+ * </p>
+ * <p>
+ * Key: {@code -hisc.epsilon}
+ * </p>
+ */
+ public static final OptionID EPSILON_ID = new OptionID("hisc.epsilon", "The maximum distance between two vectors with equal preference vectors before considering them as parallel.");
+
+ /**
+ * Factory to produce the index.
+ */
+ private IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>> indexfactory;
+
+ /**
+ * Alpha parameter.
+ */
+ double alpha;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
- alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
- double alpha = 0.0;
if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
@@ -104,21 +304,19 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB
// Configure HiSC distance function
ListParameterization opticsParameters = new ListParameterization();
- // distance function
- opticsParameters.addParameter(HiSCDistanceFunction.EPSILON_ID, alpha);
// preprocessor
opticsParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, HiSCPreferenceVectorIndex.Factory.class);
opticsParameters.addParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, alpha);
ChainedParameterization chain = new ChainedParameterization(opticsParameters, config);
chain.errorsTo(config);
- Class<HiSCDistanceFunction<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(HiSCDistanceFunction.class);
- distanceFunction = chain.tryInstantiate(cls);
+ final Class<? extends IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>>> cls = ClassGenericsUtil.uglyCrossCast(HiSCPreferenceVectorIndex.Factory.class, IndexFactory.class);
+ indexfactory = chain.tryInstantiate(cls);
}
@Override
protected HiSC<V> makeInstance() {
- return new HiSC<>(distanceFunction);
+ return new HiSC<>(indexfactory, alpha);
}
}
} \ No newline at end of file
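
HiSC's neighbor computation differs from DiSH in one detail: instead of testing only the common preference vector, it measures the two points under each point's own preference vector and raises the correlation value when either weighted distance exceeds alpha. A small stand-alone sketch of that decision and of the orthogonal distance in the flipped common subspace (plain Java, at most 64 dimensions, illustrative names only):

    // Sketch of HiSC's correlation value and orthogonal distance (<= 64 dims).
    public class HiscEntrySketch {
      static double weightedDistance(double[] v1, double[] v2, long weights) {
        double sqrDist = 0;
        for(int i = 0; i < v1.length; i++) {
          if((weights >>> i & 1L) != 0L) {
            double delta = v1[i] - v2[i];
            sqrDist += delta * delta;
          }
        }
        return Math.sqrt(sqrDist);
      }

      public static void main(String[] args) {
        double alpha = 0.001;
        double[] v1 = { 0.0, 0.3, 0.0 }, v2 = { 0.0, 0.7, 1.0 };
        long pv1 = 0b101, pv2 = 0b001; // preferred (low-variance) dimensions
        long common = pv1 & pv2;
        int dim = v1.length;
        int subspaceDim = dim - Long.bitCount(common); // 3 - 1 = 2

        // Parallel-subspace check: compare under each point's own preference.
        double dist1 = weightedDistance(v1, v2, pv1); // dims 0 and 2 -> 1.0
        double dist2 = weightedDistance(v1, v2, pv2); // dim 0 only -> 0.0
        if(Math.max(dist1, dist2) > alpha) {
          subspaceDim++; // too far apart to share a cluster: now 3
        }

        // Flip the common preference vector for the distance computation
        // in the common subspace.
        long inverseCommon = ~common & ((1L << dim) - 1);
        double orthogonalDistance = weightedDistance(v1, v2, inverseCommon);
        System.out.println(subspaceDim + " " + orthogonalDistance);
      }
    }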
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
index 9d1ee94d..4e1de7c7 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,12 +25,13 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EM;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EMClusterModel;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -61,7 +62,6 @@ import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
@@ -103,7 +103,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*/
@Title("P3C: A Robust Projected Clustering Algorithm.")
@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
-public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+public class P3C<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -156,7 +156,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
/**
* Performs the P3C algorithm on the given Database.
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
final int dim = RelationUtil.dimensionality(relation);
// Overall progress.
@@ -167,7 +167,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
    // Desired number of bins, as per Sturges' rule: ceil(1 + log2(n)).
- final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
+ final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
// Perform 1-dimensional projections, and split into bins.
SetDBIDs[][] partitions = partitionData(relation, binCount);
@@ -178,7 +178,6 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// Set markers for each attribute until they're all deemed uniform.
final long[][] markers = new long[dim][];
- int numuniform = 0;
for(int d = 0; d < dim; d++) {
final SetDBIDs[] parts = partitions[d];
if(parts == null) {
@@ -191,7 +190,6 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// previously marked.
int bestBin = chiSquaredUniformTest(parts, marked, card);
if(bestBin < 0) {
- numuniform++;
break; // Uniform
}
BitsUtil.setI(marked, bestBin);
@@ -224,9 +222,9 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
if(clusterCores.size() == 0) {
- stepProgress.setCompleted(LOG);
- Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
- c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+ LOG.setCompleted(stepProgress);
+ Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
+ c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
return c;
}
@@ -238,26 +236,19 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
ModifiableDBIDs noise = DBIDUtil.newHashSet();
WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
int k = clusterCores.size();
- double[] clusterWeights = new double[k];
- computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+ List<MultivariateGaussianModel> models = new ArrayList<>(k);
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
// Initial estimate of covariances, to assign noise objects
- Vector[] means = new Vector[k];
- Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
- final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
- double[] normDistrFactor = new double[k];
- Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
- EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
- EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
- assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
-
- double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
+ assignUnassigned(relation, probClusterIGivenX, models, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
final double emOld = emNew;
- EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
- EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
// reassign probabilities
- emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
if(LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
@@ -283,7 +274,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
// Outlier detection. Remove points from clusters that have a Mahalanobis
// distance larger than the critical value of the ChiSquare distribution.
- findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+ findOutliers(relation, models, clusterCandidates, noise);
if(stepProgress != null) {
stepProgress.beginStep(8, "Removing empty clusters.", LOG);
@@ -312,20 +303,18 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
// Generate final output.
- Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
ClusterCandidate candidate = clusterCandidates.get(cluster);
CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
- result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
}
LOG.verbose("Noise size: " + noise.size());
if(noise.size() > 0) {
- result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
}
- if(stepProgress != null) {
- stepProgress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(stepProgress);
return result;
}
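
Stripped of ELKI types, the EM refinement that run() drives above alternates model re-estimation with reassignment until the expectation value stops moving; a negative maxEmIterations means "iterate until convergence". The skeleton below is a sketch under that reading, with a hypothetical Model stand-in for EMClusterModel, not the actual ELKI interface:

    // Skeleton of the EM alternation behind P3C's refinement phase.
    // `Model` is a hypothetical stand-in for ELKI's EMClusterModel.
    interface Model {
      void update(double[][] data, double[][] resp, int clusterIndex); // M-step
      double logDensity(double[] point); // weighted log-density of the component
    }

    final class EmLoopSketch {
      // Returns the final mean log-likelihood; maxIter < 0 iterates to convergence.
      static double iterate(double[][] data, double[][] resp, java.util.List<Model> models, int maxIter) {
        double emNew = eStep(data, resp, models);
        for(int it = 1; it <= maxIter || maxIter < 0; it++) {
          final double emOld = emNew;
          for(int c = 0; c < models.size(); c++) {
            models.get(c).update(data, resp, c); // M-step per component
          }
          emNew = eStep(data, resp, models); // E-step
          if(Math.abs(emNew - emOld) < 1e-7) {
            break; // expectation value stopped moving
          }
        }
        return emNew;
      }

      // E-step: recompute soft assignments; returns the mean log-likelihood.
      static double eStep(double[][] data, double[][] resp, java.util.List<Model> models) {
        double logLikelihood = 0;
        for(int i = 0; i < data.length; i++) {
          double total = 0;
          for(int c = 0; c < models.size(); c++) {
            resp[i][c] = Math.exp(models.get(c).logDensity(data[i]));
            total += resp[i][c];
          }
          for(int c = 0; c < models.size(); c++) {
            resp[i][c] /= total; // normalize to membership probabilities
          }
          logLikelihood += Math.log(total);
        }
        return logLikelihood / data.length;
      }
    }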
@@ -500,7 +489,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param parts Parts array
* @param start Array start index
* @param end Array end index (exclusive)
- * @return
+   * @return Union of the given parts
*/
protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
int sum = 0;
@@ -561,12 +550,15 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param clusterCores the cluster cores.
* @param unassigned set to which to add unassigned points.
* @param probClusterIGivenX Membership probabilities.
- * @param clusterWeights Cluster weights
+ * @param models Cluster models.
+ * @param dim Dimensionality
*/
- private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+ private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, int dim) {
final int n = relation.size();
+ final double pweight = 1. / n; // Weight of each point
final int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
int count = 0;
double[] weights = new double[k];
@@ -581,7 +573,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
if(count > 0) {
// Rescale.
VMath.timesEquals(weights, 1. / count);
- VMath.plusTimesEquals(clusterWeights, weights, 1. / n);
+ VMath.plusTimesEquals(clusterWeights, weights, pweight);
}
else {
// Does not match any cluster, mark it.
@@ -589,6 +581,10 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
}
probClusterIGivenX.put(iter, weights);
}
+    final double f = MathUtil.powi(MathUtil.TWOPI, dim); // Gaussian normalization: (2 * pi)^dim
+ for(int i = 0; i < k; i++) {
+ models.add(new MultivariateGaussianModel(clusterWeights[i], new Vector(dim), f));
+ }
}
/**
@@ -597,36 +593,44 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
*
* @param relation Data relation
* @param probClusterIGivenX fuzzy membership matrix.
- * @param means Cluster means.
- * @param invCovMatr Cluster covariance matrices.
- * @param clusterWeights
- * @param assigned mapping of matrix row to DBID.
+ * @param models Cluster models.
* @param unassigned the list of points not yet assigned.
*/
- private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+ private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, ModifiableDBIDs unassigned) {
if(unassigned.size() == 0) {
return;
}
- final int k = means.length;
+ final int k = models.size();
double pweight = 1. / relation.size();
+ // Rescale weights, to take unassigned points into account:
+ for(EMClusterModel<?> m : models) {
+ m.setWeight(m.getWeight() * (relation.size() - unassigned.size()) * pweight);
+ }
+
+ // Assign noise objects, increase weights accordingly.
for(DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
// Find the best matching known cluster core using the Mahalanobis
// distance.
- Vector v = relation.get(iter).getColumnVector();
+ V v = relation.get(iter);
int bestCluster = -1;
+ MultivariateGaussianModel bestModel = null;
double minDistance = Double.POSITIVE_INFINITY;
- for(int c = 0; c < k; ++c) {
- final double distance = MathUtil.mahalanobisDistance(invCovMatr[c], v.minus(means[c]));
+ int c = 0;
+ for(MultivariateGaussianModel model : models) {
+ final double distance = model.mahalanobisDistance(v);
if(distance < minDistance) {
minDistance = distance;
bestCluster = c;
+ bestModel = model;
}
+ c++;
}
// Assign to best core.
double[] weights = new double[k];
- weights[bestCluster] = 1.0;
- clusterWeights[bestCluster] += pweight;
+ weights[bestCluster] = 1.;
+
+ bestModel.setWeight(bestModel.getWeight() + pweight);
probClusterIGivenX.put(iter, weights);
}
@@ -675,28 +679,19 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* attributes.
*
* @param relation Data relation
- * @param means Cluster means
- * @param invCovMatr Inverse covariance matrixes
+ * @param models Cluster models
* @param clusterCandidates the list of clusters to check.
- * @param nonUniformDimensionCount the number of dimensions to consider when
- * testing.
* @param noise the set to which to add points deemed outliers.
*/
- private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
- final int k = clusterCandidates.size();
-
- for(int c = 0; c < k; ++c) {
+ private void findOutliers(Relation<V> relation, List<MultivariateGaussianModel> models, ArrayList<ClusterCandidate> clusterCandidates, ModifiableDBIDs noise) {
+ Iterator<MultivariateGaussianModel> it = models.iterator();
+ for(int c = 0; it.hasNext(); c++) {
+ MultivariateGaussianModel model = it.next();
final ClusterCandidate candidate = clusterCandidates.get(c);
- if(candidate.ids.size() < 2) {
- continue;
- }
- final int dof = candidate.dimensions.cardinality();
- final double threshold = ChiSquaredDistribution.quantile(1 - .001, dof);
+ final int dof = BitsUtil.cardinality(candidate.dimensions);
+ final double threshold = ChiSquaredDistribution.quantile(1 - alpha, dof);
for(DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
- final Vector mean = means[c];
- final Vector delta = relation.get(iter).getColumnVector().minusEquals(mean);
- final Matrix invCov = invCovMatr[c];
- final double distance = MathUtil.mahalanobisDistance(invCov, delta);
+ final double distance = model.mahalanobisDistance(relation.get(iter));
if(distance >= threshold) {
// Outlier, remove it and add it to the outlier set.
noise.add(iter);
@@ -851,7 +846,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
/**
* Selected dimensions
*/
- public final BitSet dimensions;
+ public final long[] dimensions;
/**
* Objects contained in cluster.
@@ -864,9 +859,9 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
* @param clusterCore Signature
*/
public ClusterCandidate(Signature clusterCore) {
- this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+ this.dimensions = BitsUtil.zero(clusterCore.spec.length >> 1);
for(int i = 0; i < clusterCore.spec.length; i += 2) {
- this.dimensions.set(i >> 1);
+ BitsUtil.setI(this.dimensions, i >> 1);
}
this.ids = DBIDUtil.newArray(clusterCore.ids.size());
}
@@ -889,7 +884,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter for the chi squared test threshold.
*/
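
The outlier pruning in findOutliers() keeps a point only while its squared Mahalanobis distance to the cluster model stays below the chi-squared quantile at 1 - alpha, with one degree of freedom per relevant dimension. A compact sketch with a diagonal covariance and two relevant dimensions, where the quantile has the closed form -2 ln(alpha) (illustrative names, not the ELKI model API):

    // Chi-squared outlier pruning, sketched for a diagonal covariance model.
    public class OutlierPruneSketch {
      public static void main(String[] args) {
        double alpha = 0.001;
        // Two relevant dimensions => 2 degrees of freedom; for dof = 2 the
        // chi-squared quantile has the closed form q(1 - a) = -2 ln(a).
        double threshold = -2 * Math.log(alpha); // ~ 13.8155

        double[] mean = { 1.0, 2.0 };
        double[] variance = { 0.25, 1.0 }; // diagonal covariance
        double[][] points = { { 1.1, 2.5 }, { 4.0, 7.0 } };

        for(double[] p : points) {
          double mahalanobisSq = 0;
          for(int d = 0; d < mean.length; d++) {
            double delta = p[d] - mean[d];
            mahalanobisSq += delta * delta / variance[d];
          }
          // The squared Mahalanobis distance of a Gaussian sample is
          // chi-squared distributed with dof = number of dimensions.
          System.out.println(mahalanobisSq >= threshold ? "outlier" : "keep");
        }
      }
    }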
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
index 03e9978f..805281b0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,7 +28,6 @@ import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import java.util.ArrayList;
-import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
@@ -44,25 +43,32 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultUtil;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.Mean;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -71,13 +77,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.CTriple;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
/**
* <p/>
- * Provides the PROCLUS algorithm, an algorithm to find subspace clusters in
- * high dimensional spaces.
+ * The PROCLUS algorithm for finding subspace clusters in high dimensional
+ * spaces.
* </p>
* <p/>
* Reference: <br>
@@ -92,30 +97,20 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
-// TODO: optimize by creating much less objects
@Title("PROCLUS: PROjected CLUStering")
@Description("Algorithm to find subspace clusters in high dimensional spaces.")
-@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188")
-public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", //
+title = "Fast Algorithms for Projected Clustering", //
+booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", //
+url = "http://dx.doi.org/10.1145/304181.304188")
+public class PROCLUS<V extends NumberVector> extends AbstractProjectedClustering<Clustering<SubspaceModel>, V> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(PROCLUS.class);
/**
- * Parameter to specify the multiplier for the initial number of medoids, must
- * be an integer greater than 0.
- * <p>
- * Default value: {@code 10}
- * </p>
- * <p>
- * Key: {@code -proclus.mi}
- * </p>
- */
- public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids.");
-
- /**
- * Holds the value of {@link #M_I_ID}.
+ * Multiplier for the initial number of medoids.
*/
private int m_i;
@@ -145,27 +140,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param database Database to process
* @param relation Relation to process
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
- DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database);
- RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
+ DistanceQuery<V> distFunc = this.getDistanceQuery(database);
+ RangeQuery<V> rangeQuery = database.getRangeQuery(distFunc);
final Random random = rnd.getSingleThreadedRandom();
- if (RelationUtil.dimensionality(relation) < l) {
+ if(RelationUtil.dimensionality(relation) < l) {
throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")");
}
// TODO: use a StepProgress!
// initialization phase
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("1. Initialization phase...");
}
int sampleSize = Math.min(relation.size(), k_i * k);
DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random.nextLong());
int medoidSize = Math.min(relation.size(), m_i * k);
- DBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random);
+ ArrayDBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("sampleSize ").append(sampleSize).append('\n');
@@ -176,15 +171,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
}
// iterative phase
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("2. Iterative phase...");
}
double bestObjective = Double.POSITIVE_INFINITY;
- ModifiableDBIDs m_best = null;
- ModifiableDBIDs m_bad = null;
- ModifiableDBIDs m_current = initialSet(medoids, k, random);
+ ArrayDBIDs m_best = null;
+ DBIDs m_bad = null;
+ ArrayDBIDs m_current = initialSet(medoids, k, random);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("m_c ").append(m_current).append('\n');
@@ -193,47 +188,44 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null;
- // TODO: Use DataStore and Trove for performance
- Map<DBID, PROCLUSCluster> clusters = null;
+ ArrayList<PROCLUSCluster> clusters = null;
int loops = 0;
- while (loops < 10) {
- Map<DBID, TIntSet> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery);
- clusters = assignPoints(dimensions, relation);
+ while(loops < 10) {
+ TIntSet[] dimensions = findDimensions(m_current, relation, distFunc, rangeQuery);
+ clusters = assignPoints(m_current, dimensions, relation);
double objectiveFunction = evaluateClusters(clusters, dimensions, relation);
- if (objectiveFunction < bestObjective) {
+ if(objectiveFunction < bestObjective) {
// restart counting loops
loops = 0;
bestObjective = objectiveFunction;
m_best = m_current;
- m_bad = computeBadMedoids(clusters, (int) (relation.size() * 0.1 / k));
+ m_bad = computeBadMedoids(m_current, clusters, (int) (relation.size() * 0.1 / k));
}
m_current = computeM_current(medoids, m_best, m_bad, random);
loops++;
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size(), LOG);
}
}
- if (cprogress != null) {
- cprogress.setCompleted(LOG);
- }
+ LOG.setCompleted(cprogress);
// refinement phase
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("3. Refinement phase...");
}
- List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<>(clusters.values()), relation);
+ List<Pair<Vector, TIntSet>> dimensions = findDimensions(clusters, relation);
List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation);
// build result
int numClusters = 1;
- Clustering<SubspaceModel<V>> result = new Clustering<>("ProClus clustering", "proclus-clustering");
- for (PROCLUSCluster c : finalClusters) {
- Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.objectIDs);
- cluster.setModel(new SubspaceModel<>(new Subspace(c.getDimensions()), c.centroid));
+ Clustering<SubspaceModel> result = new Clustering<>("ProClus clustering", "proclus-clustering");
+ for(PROCLUSCluster c : finalClusters) {
+ Cluster<SubspaceModel> cluster = new Cluster<>(c.objectIDs);
+ cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid));
cluster.setName("cluster_" + numClusters++);
result.addToplevelCluster(cluster);
@@ -250,48 +242,64 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param random random number generator
* @return a piercing set of m medoids from the specified sample set
*/
- private ModifiableDBIDs greedy(DistanceQuery<V, DoubleDistance> distFunc, DBIDs sampleSet, int m, Random random) {
+ private ArrayDBIDs greedy(DistanceQuery<V> distFunc, DBIDs sampleSet, int m, Random random) {
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(m);
+
ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet);
- ModifiableDBIDs medoids = DBIDUtil.newHashSet();
+ DBIDArrayIter iter = s.iter();
+ DBIDVar m_i = DBIDUtil.newVar();
+ int size = s.size();
// m_1 is random point of S
- DBID m_i = s.remove(random.nextInt(s.size()));
+ final int r = random.nextInt(size);
+ s.assignVar(r, m_i);
medoids.add(m_i);
- if (LOG.isDebugging()) {
- LOG.debugFiner("medoids " + medoids);
+ if(LOG.isDebugging()) {
+ LOG.debugFiner("medoids " + medoids.toString());
}
+    // Remove m_i from the candidates by moving it to the end.
+ s.swap(r, size - 1);
+ --size;
+
+ // To track the current worst element:
+ int worst = -1;
+ double worstd = Double.NEGATIVE_INFINITY;
// compute distances between each point in S and m_i
- // FIXME: don't use maps, so we can work with DBIDRef
- Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<>();
- for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) {
- DBID id = DBIDUtil.deref(iter);
- DoubleDistance dist = distFunc.distance(id, m_i);
- distances.put(id, DBIDUtil.newDistancePair(dist, id));
+ WritableDoubleDataStore distances = DataStoreUtil.makeDoubleStorage(s, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ for(iter.seek(0); iter.getOffset() < size; iter.advance()) {
+ final double dist = distFunc.distance(iter, m_i);
+ distances.putDouble(iter, dist);
+ if(dist > worstd) {
+ worstd = dist;
+ worst = iter.getOffset();
+ }
}
- for (int i = 1; i < m; i++) {
+ for(int i = 1; i < m; i++) {
// choose medoid m_i to be far from previous medoids
- List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<>(distances.values());
- DistanceDBIDResultUtil.sortByDistance(d);
-
- m_i = DBIDUtil.deref(d.get(d.size() - 1));
+ s.assignVar(worst, m_i);
medoids.add(m_i);
- s.remove(m_i);
- distances.remove(m_i);
-
- // compute distances of each point to closest medoid
- for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) {
- DBID id = DBIDUtil.deref(iter);
- DoubleDistance dist_new = distFunc.distance(id, m_i);
- DoubleDistance dist_old = distances.get(id).getDistance();
-
- DoubleDistance dist = dist_new.compareTo(dist_old) < 0 ? dist_new : dist_old;
- distances.put(id, DBIDUtil.newDistancePair(dist, id));
+      // Remove m_i from the candidates by moving it to the end.
+ s.swap(worst, size - 1);
+ --size;
+
+ // compute distances of each point to closest medoid; track worst.
+ worst = -1;
+ worstd = Double.NEGATIVE_INFINITY;
+ for(iter.seek(0); iter.getOffset() < size; iter.advance()) {
+ double dist_new = distFunc.distance(iter, m_i);
+ double dist_old = distances.doubleValue(iter);
+ double dist = (dist_new < dist_old) ? dist_new : dist_old;
+ distances.putDouble(iter, dist);
+ if(dist > worstd) {
+ worstd = dist;
+ worst = iter.getOffset();
+ }
}
- if (LOG.isDebugging()) {
- LOG.debugFiner("medoids " + medoids);
+ if(LOG.isDebugging()) {
+ LOG.debugFiner("medoids " + medoids.toString());
}
}
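
The rewritten greedy() replaces the map-and-sort version with an in-place farthest-point traversal: keep, per remaining candidate, the distance to the closest medoid chosen so far, promote the candidate with the largest such distance, and "remove" chosen medoids by swapping them to the end of the array. The same idea over plain arrays (a sketch, not the ELKI data stores; assumes m <= dist.length):

    import java.util.Random;

    // Farthest-point greedy selection over a plain distance matrix.
    public class GreedyMedoidsSketch {
      static int[] greedy(double[][] dist, int m, Random random) {
        int size = dist.length;
        int[] idx = new int[size]; // candidate permutation; chosen ones move to the end
        for(int i = 0; i < size; i++) {
          idx[i] = i;
        }
        int[] medoids = new int[m];
        double[] nearest = new double[size]; // distance to the closest chosen medoid
        int cur = random.nextInt(size); // the first medoid is random
        for(int i = 0; i < m; i++) {
          medoids[i] = idx[cur];
          // Remove the chosen medoid by swapping it to the end (no list removal).
          int tmp = idx[cur]; idx[cur] = idx[--size]; idx[size] = tmp;
          double tmpd = nearest[cur]; nearest[cur] = nearest[size]; nearest[size] = tmpd;
          // Refresh the nearest-medoid distances and track the farthest candidate.
          cur = -1;
          double worst = Double.NEGATIVE_INFINITY;
          for(int j = 0; j < size; j++) {
            double d = dist[idx[j]][medoids[i]];
            nearest[j] = (i == 0) ? d : Math.min(nearest[j], d);
            if(nearest[j] > worst) {
              worst = nearest[j];
              cur = j;
            }
          }
        }
        return medoids;
      }
    }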
@@ -306,14 +314,8 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param random random number generator
* @return a set of k elements from the specified sample set
*/
- private ModifiableDBIDs initialSet(DBIDs sampleSet, int k, Random random) {
- ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet);
- ModifiableDBIDs initialSet = DBIDUtil.newHashSet();
- while (initialSet.size() < k) {
- DBID next = s.remove(random.nextInt(s.size()));
- initialSet.add(next);
- }
- return initialSet;
+ private ArrayDBIDs initialSet(DBIDs sampleSet, int k, Random random) {
+ return DBIDUtil.ensureArray(DBIDUtil.randomSample(sampleSet, k, random));
}
/**
@@ -325,21 +327,21 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param random random number generator
* @return m_current, the set of medoids in current iteration
*/
- private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) {
+ private ArrayDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) {
ArrayModifiableDBIDs m_list = DBIDUtil.newArray(m);
m_list.removeDBIDs(m_best);
- ModifiableDBIDs m_current = DBIDUtil.newHashSet();
- for (DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) {
- DBID m_i = DBIDUtil.deref(iter);
- if (m_bad.contains(m_i)) {
+ ArrayModifiableDBIDs m_current = DBIDUtil.newArray();
+ for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) {
+ if(m_bad.contains(iter)) {
int currentSize = m_current.size();
- while (m_current.size() == currentSize) {
+ while(m_current.size() == currentSize) {
DBID next = m_list.remove(random.nextInt(m_list.size()));
m_current.add(next);
}
- } else {
- m_current.add(m_i);
+ }
+ else {
+ m_current.add(iter);
}
}
@@ -357,29 +359,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param distFunc the distance function
* @return a mapping of the medoid's id to its locality
*/
- private Map<DBID, DistanceDBIDList<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) {
- Map<DBID, DistanceDBIDList<DoubleDistance>> result = new HashMap<>();
+ private DataStore<DoubleDBIDList> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) {
+ WritableDataStore<DoubleDBIDList> result = DataStoreUtil.makeStorage(medoids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, DoubleDBIDList.class);
- for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
- DBID m = DBIDUtil.deref(iter);
+ for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
// determine minimum distance between current medoid m and any other
// medoid m_i
- DoubleDistance minDist = null;
- for (DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) {
- DBID m_i = DBIDUtil.deref(iter2);
- if (DBIDUtil.equal(m_i, m)) {
+ double minDist = Double.POSITIVE_INFINITY;
+ for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) {
+ if(DBIDUtil.equal(iter, iter2)) {
continue;
}
- DoubleDistance currentDist = distFunc.distance(m, m_i);
- if (minDist == null || currentDist.compareTo(minDist) < 0) {
+ double currentDist = distFunc.distance(iter, iter2);
+ if(currentDist < minDist) {
minDist = currentDist;
}
}
// determine points in sphere centered at m with radius minDist
- assert minDist != null;
- DistanceDBIDList<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist);
- result.put(m, qr);
+ assert minDist != Double.POSITIVE_INFINITY;
+ DoubleDBIDList qr = rangeQuery.getRangeForDBID(iter, minDist);
+ result.put(iter, qr);
}
return result;
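getLocalities implements the PROCLUS locality rule: the sphere around medoid m_i has radius equal to the distance to its nearest other medoid, so each medoid claims the points closer to it than to any competitor. A plain-array sketch of the same computation (reusing distance() from the farthest-first sketch above):

// Points within the distance from medoids[i] to its nearest other medoid.
static java.util.List<Integer> locality(int i, double[][] medoids, double[][] points) {
  double minDist = Double.POSITIVE_INFINITY;
  for(int j = 0; j < medoids.length; j++) {
    if(j != i) {
      minDist = Math.min(minDist, distance(medoids[i], medoids[j]));
    }
  }
  java.util.List<Integer> result = new java.util.ArrayList<>();
  for(int p = 0; p < points.length; p++) {
    if(distance(medoids[i], points[p]) <= minDist) {
      result.add(p);
    }
  }
  return result;
}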
@@ -395,68 +395,67 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the set of correlated dimensions for each medoid in the specified
* medoid set
*/
- private Map<DBID, TIntSet> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) {
+ private TIntSet[] findDimensions(ArrayDBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) {
// get localities
- Map<DBID, DistanceDBIDList<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery);
+ DataStore<DoubleDBIDList> localities = getLocalities(medoids, database, distFunc, rangeQuery);
// compute x_ij = avg distance from points in l_i to medoid m_i
int dim = RelationUtil.dimensionality(database);
- Map<DBID, double[]> averageDistances = new HashMap<>();
+ double[][] averageDistances = new double[medoids.size()][];
- for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
- DBID m_i = DBIDUtil.deref(iter);
- V medoid_i = database.get(m_i);
- DistanceDBIDList<DoubleDistance> l_i = localities.get(m_i);
+ int i = 0;
+ for(DBIDArrayIter iter = medoids.iter(); iter.valid(); iter.advance(), i++) {
+ V medoid_i = database.get(iter);
+ DoubleDBIDList l_i = localities.get(iter);
double[] x_i = new double[dim];
- for (DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) {
+ for(DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) {
V o = database.get(qr);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] += Math.abs(medoid_i.doubleValue(d) - o.doubleValue(d));
}
}
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] /= l_i.size();
}
- averageDistances.put(m_i, x_i);
+ averageDistances[i] = x_i;
}
- Map<DBID, TIntSet> dimensionMap = new HashMap<>();
- List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<>();
- for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
- DBID m_i = DBIDUtil.deref(iter);
+ TIntSet[] dimensionMap = new TIntSet[medoids.size()];
+ List<DoubleIntInt> z_ijs = new ArrayList<>();
+ for(i = 0; i < medoids.size(); i++) {
TIntSet dims_i = new TIntHashSet();
- dimensionMap.put(m_i, dims_i);
+ dimensionMap[i] = dims_i;
- double[] x_i = averageDistances.get(m_i);
+ double[] x_i = averageDistances[i];
// y_i
double y_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
y_i += x_i[j];
}
y_i /= dim;
// sigma_i
double sigma_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
double diff = x_i[j] - y_i;
sigma_i += diff * diff;
}
sigma_i /= (dim - 1);
sigma_i = Math.sqrt(sigma_i);
- for (int j = 0; j < dim; j++) {
- z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, m_i, j));
+ for(int j = 0; j < dim; j++) {
+ z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j));
}
}
Collections.sort(z_ijs);
int max = Math.max(k * l, 2);
- for (int m = 0; m < max; m++) {
- CTriple<Double, DBID, Integer> z_ij = z_ijs.get(m);
- TIntSet dims_i = dimensionMap.get(z_ij.getSecond());
- dims_i.add(z_ij.getThird());
+ for(int m = 0; m < max; m++) {
+ DoubleIntInt z_ij = z_ijs.get(m);
+ TIntSet dims_i = dimensionMap[z_ij.dimi];
+ dims_i.add(z_ij.dimj);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("z_ij ").append(z_ij).append('\n');
@@ -476,64 +475,65 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the set of correlated dimensions for each specified cluster
* centroid
*/
- private List<Pair<V, TIntSet>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) {
+ private List<Pair<Vector, TIntSet>> findDimensions(ArrayList<PROCLUSCluster> clusters, Relation<V> database) {
// compute x_ij = avg distance from points in c_i to c_i.centroid
int dim = RelationUtil.dimensionality(database);
- Map<Integer, double[]> averageDistances = new HashMap<>();
+ final int numc = clusters.size();
+ double[][] averageDistances = new double[numc][];
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < numc; i++) {
PROCLUSCluster c_i = clusters.get(i);
double[] x_i = new double[dim];
- for (DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) {
V o = database.get(iter);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] += Math.abs(c_i.centroid.doubleValue(d) - o.doubleValue(d));
}
}
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] /= c_i.objectIDs.size();
}
- averageDistances.put(i, x_i);
+ averageDistances[i] = x_i;
}
- List<CTriple<Double, Integer, Integer>> z_ijs = new ArrayList<>();
- for (int i = 0; i < clusters.size(); i++) {
- double[] x_i = averageDistances.get(i);
+ List<DoubleIntInt> z_ijs = new ArrayList<>();
+ for(int i = 0; i < numc; i++) {
+ double[] x_i = averageDistances[i];
// y_i
double y_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
y_i += x_i[j];
}
y_i /= dim;
// sigma_i
double sigma_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
double diff = x_i[j] - y_i;
sigma_i += diff * diff;
}
sigma_i /= (dim - 1);
sigma_i = Math.sqrt(sigma_i);
- for (int j = 0; j < dim; j++) {
- z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, i, j));
+ for(int j = 0; j < dim; j++) {
+ z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j));
}
}
Collections.sort(z_ijs);
// mapping cluster index -> dimensions
- Map<Integer, TIntSet> dimensionMap = new HashMap<>();
+ TIntSet[] dimensionMap = new TIntSet[numc];
int max = Math.max(k * l, 2);
- for (int m = 0; m < max; m++) {
- CTriple<Double, Integer, Integer> z_ij = z_ijs.get(m);
- TIntSet dims_i = dimensionMap.get(z_ij.getSecond());
- if (dims_i == null) {
+ for(int m = 0; m < max; m++) {
+ DoubleIntInt z_ij = z_ijs.get(m);
+ TIntSet dims_i = dimensionMap[z_ij.dimi];
+ if(dims_i == null) {
dims_i = new TIntHashSet();
- dimensionMap.put(z_ij.getSecond(), dims_i);
+ dimensionMap[z_ij.dimi] = dims_i;
}
- dims_i.add(z_ij.getThird());
+ dims_i.add(z_ij.dimj);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("z_ij ").append(z_ij).append('\n');
@@ -543,9 +543,12 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
}
// mapping cluster -> dimensions
- List<Pair<V, TIntSet>> result = new ArrayList<>();
- for (int i : dimensionMap.keySet()) {
- TIntSet dims_i = dimensionMap.get(i);
+ List<Pair<Vector, TIntSet>> result = new ArrayList<>();
+ for(int i = 0; i < numc; i++) {
+ TIntSet dims_i = dimensionMap[i];
+ if(dims_i == null) {
+ continue;
+ }
PROCLUSCluster c_i = clusters.get(i);
result.add(new Pair<>(c_i.centroid, dims_i));
}
@@ -555,45 +558,51 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
/**
* Assigns the objects to the clusters.
*
+ * @param m_current Current centers
* @param dimensions set of correlated dimensions for each medoid of the
* cluster
* @param database the database containing the objects
* @return the assignments of the object to the clusters
*/
- private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, TIntSet> dimensions, Relation<V> database) {
- Map<DBID, ModifiableDBIDs> clusterIDs = new HashMap<>();
- for (DBID m_i : dimensions.keySet()) {
- clusterIDs.put(m_i, DBIDUtil.newHashSet());
- }
-
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
- DBID p_id = DBIDUtil.deref(it);
- V p = database.get(p_id);
- DistanceDBIDPair<DoubleDistance> minDist = null;
- for (DBID m_i : dimensions.keySet()) {
+ private ArrayList<PROCLUSCluster> assignPoints(ArrayDBIDs m_current, TIntSet[] dimensions, Relation<V> database) {
+ ModifiableDBIDs[] clusterIDs = new ModifiableDBIDs[dimensions.length];
+ for(int i = 0; i < m_current.size(); i++) {
+ clusterIDs[i] = DBIDUtil.newHashSet();
+ }
+
+ DBIDArrayIter m_i = m_current.iter();
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ V p = database.get(it);
+ double minDist = Double.NaN; // NaN sentinel: the first comparison below always succeeds
+ int best = -1, i = 0;
+ for(m_i.seek(0); m_i.valid(); m_i.advance(), i++) {
V m = database.get(m_i);
- DistanceDBIDPair<DoubleDistance> currentDist = DBIDUtil.newDistancePair(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i);
- if (minDist == null || currentDist.compareByDistance(minDist) < 0) {
+ double currentDist = manhattanSegmentalDistance(p, m, dimensions[i]);
+ if(!(minDist <= currentDist)) {
minDist = currentDist;
+ best = i;
}
}
// add p to cluster with mindist
- assert minDist != null;
- ModifiableDBIDs ids = clusterIDs.get(DBIDUtil.deref(minDist));
- ids.add(p_id);
+ assert best >= 0;
+ ModifiableDBIDs ids = clusterIDs[best];
+ ids.add(it);
}
- Map<DBID, PROCLUSCluster> clusters = new HashMap<>();
- for (DBID m_i : dimensions.keySet()) {
- ModifiableDBIDs objectIDs = clusterIDs.get(m_i);
- if (!objectIDs.isEmpty()) {
- TIntSet clusterDimensions = dimensions.get(m_i);
- V centroid = Centroid.make(database, objectIDs).toVector(database);
- clusters.put(m_i, new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
+ ArrayList<PROCLUSCluster> clusters = new ArrayList<>(m_current.size());
+ for(int i = 0; i < dimensions.length; i++) {
+ ModifiableDBIDs objectIDs = clusterIDs[i];
+ if(!objectIDs.isEmpty()) {
+ TIntSet clusterDimensions = dimensions[i];
+ Vector centroid = Centroid.make(database, objectIDs);
+ clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
+ }
+ else {
+ clusters.add(null);
}
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("clusters ").append(clusters).append('\n');
@@ -610,42 +619,43 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param database the database containing the objects
* @return the assignments of the object to the clusters
*/
- private List<PROCLUSCluster> finalAssignment(List<Pair<V, TIntSet>> dimensions, Relation<V> database) {
+ private List<PROCLUSCluster> finalAssignment(List<Pair<Vector, TIntSet>> dimensions, Relation<V> database) {
Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>();
- for (int i = 0; i < dimensions.size(); i++) {
+ for(int i = 0; i < dimensions.size(); i++) {
clusterIDs.put(i, DBIDUtil.newHashSet());
}
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
- DBID p_id = DBIDUtil.deref(it);
- V p = database.get(p_id);
- Pair<DoubleDistance, Integer> minDist = null;
- for (int i = 0; i < dimensions.size(); i++) {
- Pair<V, TIntSet> pair_i = dimensions.get(i);
- V c_i = pair_i.first;
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ V p = database.get(it);
+ double minDist = Double.POSITIVE_INFINITY;
+ int best = -1;
+ for(int i = 0; i < dimensions.size(); i++) {
+ Pair<Vector, TIntSet> pair_i = dimensions.get(i);
+ Vector c_i = pair_i.first;
TIntSet dimensions_i = pair_i.second;
- DoubleDistance currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i);
- if (minDist == null || currentDist.compareTo(minDist.first) < 0) {
- minDist = new Pair<>(currentDist, i);
+ double currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i);
+ if(best < 0 || currentDist < minDist) {
+ minDist = currentDist;
+ best = i;
}
}
// add p to cluster with mindist
- assert minDist != null;
- ModifiableDBIDs ids = clusterIDs.get(minDist.second);
- ids.add(p_id);
+ assert best >= 0;
+ ModifiableDBIDs ids = clusterIDs.get(best);
+ ids.add(it);
}
List<PROCLUSCluster> clusters = new ArrayList<>();
- for (int i = 0; i < dimensions.size(); i++) {
+ for(int i = 0; i < dimensions.size(); i++) {
ModifiableDBIDs objectIDs = clusterIDs.get(i);
- if (!objectIDs.isEmpty()) {
+ if(!objectIDs.isEmpty()) {
TIntSet clusterDimensions = dimensions.get(i).second;
- V centroid = Centroid.make(database, objectIDs).toVector(database);
+ Vector centroid = Centroid.make(database, objectIDs);
clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
}
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("clusters ").append(clusters).append('\n');
@@ -664,14 +674,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the Manhattan segmental distance between o1 and o2 relative to the
* specified dimensions
*/
- private DoubleDistance manhattanSegmentalDistance(V o1, V o2, TIntSet dimensions) {
+ private double manhattanSegmentalDistance(NumberVector o1, NumberVector o2, TIntSet dimensions) {
double result = 0;
- for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
final int d = iter.next();
result += Math.abs(o1.doubleValue(d) - o2.doubleValue(d));
}
result /= dimensions.size();
- return new DoubleDistance(result);
+ return result;
}
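The Manhattan segmental distance from the PROCLUS paper is the Manhattan distance restricted to the dimension set D, normalized by |D| so that distances computed over differently sized dimension sets remain comparable. A worked example:

// Example: o1 = (1, 2, 3), o2 = (4, 2, 7), dimensions D = {0, 2}:
//   d(o1, o2, D) = (|1 - 4| + |3 - 7|) / |D| = (3 + 4) / 2 = 3.5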
/**
@@ -682,20 +692,20 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param database the database holding the objects
* @return a measure for the cluster quality
*/
- private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, TIntSet> dimensions, Relation<V> database) {
+ private double evaluateClusters(ArrayList<PROCLUSCluster> clusters, TIntSet[] dimensions, Relation<V> database) {
double result = 0;
- for (DBID m_i : clusters.keySet()) {
- PROCLUSCluster c_i = clusters.get(m_i);
- V centroid_i = c_i.centroid;
+ for(int i = 0; i < dimensions.length; i++) {
+ PROCLUSCluster c_i = clusters.get(i);
+ if(c_i == null) {
+ continue; // assignPoints may leave null entries for empty clusters
+ }
+ Vector centroid_i = c_i.centroid;
- TIntSet dims_i = dimensions.get(m_i);
+ TIntSet dims_i = dimensions[i];
double w_i = 0;
- for (TIntIterator iter = dims_i.iterator(); iter.hasNext();) {
+ for(TIntIterator iter = dims_i.iterator(); iter.hasNext();) {
final int j = iter.next();
w_i += avgDistance(centroid_i, c_i.objectIDs, database, j);
}
- w_i /= dimensions.keySet().size();
+ w_i /= dimensions.length;
result += c_i.objectIDs.size() * w_i;
}
@@ -713,9 +723,9 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the average distance of the objects to the centroid along the
* specified dimension
*/
- private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) {
+ private double avgDistance(Vector centroid, DBIDs objectIDs, Relation<V> database, int dimension) {
Mean avg = new Mean();
- for (DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) {
V o = database.get(iter);
avg.put(Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension)));
}
@@ -726,16 +736,18 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* Computes the bad medoids, where the medoid of a cluster with less than the
* specified threshold of objects is bad.
*
+ * @param m_current Current medoids
* @param clusters the clusters
* @param threshold the threshold
* @return the bad medoids
*/
- private ModifiableDBIDs computeBadMedoids(Map<DBID, PROCLUSCluster> clusters, int threshold) {
- ModifiableDBIDs badMedoids = DBIDUtil.newHashSet();
- for (DBID m_i : clusters.keySet()) {
- PROCLUSCluster c_i = clusters.get(m_i);
- if (c_i.objectIDs.size() < threshold) {
- badMedoids.add(m_i);
+ private DBIDs computeBadMedoids(ArrayDBIDs m_current, ArrayList<PROCLUSCluster> clusters, int threshold) {
+ ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(m_current.size());
+ int i = 0;
+ for(DBIDIter it = m_current.iter(); it.valid(); it.advance(), i++) {
+ PROCLUSCluster c_i = clusters.get(i);
+ if(c_i == null || c_i.objectIDs.size() < threshold) {
+ badMedoids.add(it);
}
}
return badMedoids;
@@ -752,6 +764,36 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
}
/**
+ * Simple triple.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class DoubleIntInt implements Comparable<DoubleIntInt> {
+ protected double first;
+
+ protected int dimi, dimj;
+
+ public DoubleIntInt(double first, int second, int third) {
+ this.first = first;
+ this.dimi = second;
+ this.dimj = third;
+ }
+
+ @Override
+ public int compareTo(DoubleIntInt o) {
+ if(this.first < o.first) {
+ return -1;
+ }
+ if(this.first > o.first) {
+ return +1;
+ }
+ return 0;
+ }
+ }
+
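DoubleIntInt replaces CTriple<Double, Integer, Integer>, so sorting the z_ij list no longer autoboxes every key and index. Note that compareTo orders by the double key only; Collections.sort is stable, so triples with equal keys keep their insertion order. A usage sketch (valid inside PROCLUS, where the private class is visible):

List<DoubleIntInt> z_ijs = new ArrayList<>();
z_ijs.add(new DoubleIntInt(0.3, 1, 0));
z_ijs.add(new DoubleIntInt(-1.5, 0, 2));
Collections.sort(z_ijs); // ascending by key: (-1.5, 0, 2) comes first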
+ /**
* Encapsulates the attributes of a cluster.
*
* @apiviz.exclude
@@ -770,16 +812,16 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
/**
* The centroids of this cluster along each dimension.
*/
- V centroid;
+ Vector centroid;
/**
- * Provides a new cluster with the specified parameters.
+ * Constructor.
*
* @param objectIDs the ids of the objects belonging to this cluster
* @param dimensions the correlated dimensions of this cluster
* @param centroid the centroid of this cluster
*/
- public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, V centroid) {
+ public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, Vector centroid) {
this.objectIDs = objectIDs;
this.dimensions = dimensions;
this.centroid = centroid;
@@ -790,10 +832,11 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
StringBuilder result = new StringBuilder();
result.append("Dimensions: [");
boolean notFirst = false;
- for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
- if (notFirst) {
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ if(notFirst) {
result.append(',');
- } else {
+ }
+ else {
notFirst = true;
}
result.append(iter.next());
@@ -809,10 +852,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
*
* @return the correlated dimensions of this cluster as BitSet
*/
- public BitSet getDimensions() {
- BitSet result = new BitSet();
- for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
- result.set(iter.next());
+ public long[] getDimensions() {
+ int maxdim = 0;
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ int d = iter.next();
+ maxdim = (d > maxdim) ? d : maxdim;
+ }
+ long[] result = BitsUtil.zero(maxdim + 1); // capacity must cover bit index maxdim
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ BitsUtil.setI(result, iter.next());
}
return result;
}
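The switch from java.util.BitSet to long[] bitmasks recurs throughout this changeset (see also SUBCLU and CLIQUESubspace below). A minimal round-trip using only the BitsUtil calls that appear in this diff:

long[] dims = BitsUtil.zero(8); // capacity for bit indices 0..7
BitsUtil.setI(dims, 1);
BitsUtil.setI(dims, 4);
for(int d = BitsUtil.nextSetBit(dims, 0); d >= 0; d = BitsUtil.nextSetBit(dims, d + 1)) {
  System.out.println("dimension " + d); // prints 1, then 4
}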
@@ -825,14 +873,32 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedClustering.Parameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractProjectedClustering.Parameterizer {
+ /**
+ * Parameter to specify the multiplier for the initial number of medoids,
+ * must be an integer greater than 0.
+ * <p>
+ * Default value: {@code 10}
+ * </p>
+ * <p>
+ * Key: {@code -proclus.mi}
+ * </p>
+ */
+ public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids.");
+
/**
* Parameter to specify the random generator seed.
*/
public static final OptionID SEED_ID = new OptionID("proclus.seed", "The random number generator seed.");
+ /**
+ * Multiplier for the initial number of medoids.
+ */
protected int m_i = -1;
+ /**
+ * Random generator
+ */
protected RandomFactory rnd;
@Override
@@ -843,14 +909,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
configKI(config);
configL(config);
- IntParameter m_iP = new IntParameter(M_I_ID, 10);
- m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
- if (config.grab(m_iP)) {
+ IntParameter m_iP = new IntParameter(M_I_ID, 10) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(m_iP)) {
m_i = m_iP.getValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java
index 4e670974..a61935b0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,42 +23,50 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedDBSCAN;
-import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.PreDeConCorePredicate;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.PreDeConNeighborPredicate;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.Model;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
-import de.lmu.ifi.dbs.elki.index.preprocessed.subspaceproj.PreDeConSubspaceIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
- * <p/>
* PreDeCon computes clusters of subspace preference weighted connected points.
* The algorithm searches for local subgroups of a set of feature vectors having
* a low variance along one or more (but not all) attributes.
- * </p>
- * <p/>
- * Reference: <br>
- * C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger: Density Connected Clustering
- * with Local Subspace Preferences. <br>
+ *
+ * Reference:
+ * <p>
+ * C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger:<br />
+ * Density Connected Clustering with Local Subspace Preferences.<br />
* In Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004.
* </p>
*
* @author Peer Kröger
*
- * @apiviz.uses PreDeConSubspaceIndex
+ * @apiviz.has PreDeCon.Settings
+ * @apiviz.composedOf PreDeConNeighborPredicate
+ * @apiviz.composedOf PreDeConCorePredicate
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
@Title("PreDeCon: Subspace Preference weighted Density Connected Clustering")
-@Description("PreDeCon computes clusters of subspace preference weighted connected points. " + "The algorithm searches for local subgroups of a set of feature vectors having " + "a low variance along one or more (but not all) attributes.")
-@Reference(authors = "C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density Connected Clustering with Local Subspace Preferences", booktitle = "Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004", url = "http://dx.doi.org/10.1109/ICDM.2004.10087")
-public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN<Clustering<Model>, V> {
+@Description("PreDeCon computes clusters of subspace preference weighted connected points. "//
+ + "The algorithm searches for local subgroups of a set of feature vectors having " + "a low variance along one or more (but not all) attributes.")
+@Reference(authors = "C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger", //
+title = "Density Connected Clustering with Local Subspace Preferences", //
+booktitle = "Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004", //
+url = "http://dx.doi.org/10.1109/ICDM.2004.10087")
+public class PreDeCon<V extends NumberVector> extends GeneralizedDBSCAN {
/**
* The logger for this class.
*/
@@ -67,28 +75,165 @@ public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN
/**
* Constructor.
*
- * @param epsilon Epsilon value
- * @param minpts MinPts value
- * @param distanceFunction outer distance function
- * @param lambda Lambda value
+ * @param settings PreDeCon settings.
*/
- public PreDeCon(DoubleDistance epsilon, int minpts, LocallyWeightedDistanceFunction<V> distanceFunction, int lambda) {
- super(epsilon, minpts, distanceFunction, lambda);
+ public PreDeCon(PreDeCon.Settings settings) {
+ super(new PreDeConNeighborPredicate<>(settings), new PreDeConCorePredicate(settings), false);
}
@Override
- public String getLongResultName() {
- return "PreDeCon Clustering";
+ protected Logging getLogger() {
+ return LOG;
}
- @Override
- public String getShortResultName() {
- return "predecon-clustering";
- }
+ /**
+ * Class containing all the PreDeCon settings.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Settings {
+ /**
+ * Query radius parameter epsilon.
+ */
+ public double epsilon;
- @Override
- protected Logging getLogger() {
- return LOG;
+ /**
+ * The threshold for small eigenvalues.
+ */
+ public double delta;
+
+ /**
+ * The kappa penalty factor for deviations in preferred dimensions.
+ */
+ public double kappa = Parameterizer.KAPPA_DEFAULT;
+
+ /**
+ * DBSCAN Minpts parameter, aka "mu".
+ */
+ public int minpts;
+
+ /**
+ * Lambda: Maximum subspace dimensionality.
+ */
+ public int lambda = Integer.MAX_VALUE;
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Parameter Delta: maximum variance allowed
+ */
+ public static final OptionID DELTA_ID = new OptionID("predecon.delta", "A double specifying the variance threshold for small Eigenvalues.");
+
+ /**
+ * Parameter Kappa: penalty for deviations in preferred dimensions.
+ */
+ public static final OptionID KAPPA_ID = new OptionID("predecon.kappa", "Penalty factor for deviations in preferred (low-variance) dimensions.");
+
+ /**
+ * Default for kappa parameter.
+ */
+ public static final double KAPPA_DEFAULT = 20.;
+
+ /**
+ * Parameter Lambda: maximum dimensionality allowed.
+ */
+ public static final OptionID LAMBDA_ID = new OptionID("predecon.lambda", "Maximum dimensionality to consider for core points.");
+
+ /**
+ * Settings to build.
+ */
+ Settings settings;
+
+ @Override
+ public void makeOptions(Parameterization config) {
+ settings = new Settings();
+ configEpsilon(config);
+ configMinPts(config);
+ configDelta(config);
+ configKappa(config);
+ configLambda(config);
+ }
+
+ /**
+ * Configure the epsilon radius parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configEpsilon(Parameterization config) {
+ DoubleParameter epsilonP = new DoubleParameter(DBSCAN.Parameterizer.EPSILON_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(epsilonP)) {
+ settings.epsilon = epsilonP.doubleValue();
+ }
+ }
+
+ /**
+ * Configure the minPts aka "mu" parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configMinPts(Parameterization config) {
+ IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minptsP)) {
+ settings.minpts = minptsP.intValue();
+ }
+ }
+
+ /**
+ * Configure the delta parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configDelta(Parameterization config) {
+ DoubleParameter deltaP = new DoubleParameter(DELTA_ID) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(deltaP)) {
+ settings.delta = deltaP.doubleValue();
+ }
+ }
+
+ /**
+ * Configure the kappa parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configKappa(Parameterization config) {
+ DoubleParameter kappaP = new DoubleParameter(KAPPA_ID) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ONE_DOUBLE) //
+ .setDefaultValue(KAPPA_DEFAULT);
+ if(config.grab(kappaP)) {
+ settings.kappa = kappaP.doubleValue();
+ }
+ }
+
+ /**
+ * Configure the lambda parameter.
+ *
+ * @param config Parameter source
+ */
+ protected void configLambda(Parameterization config) {
+ IntParameter lambdaP = new IntParameter(LAMBDA_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT) //
+ .setOptional(true);
+ if(config.grab(lambdaP)) {
+ settings.lambda = lambdaP.intValue();
+ }
+ }
+
+ @Override
+ public Settings makeInstance() {
+ return settings;
+ }
+ }
}
/**
@@ -98,20 +243,20 @@ public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedDBSCAN.Parameterizer<V, DoubleDistance> {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * PreDeConSettings.
+ */
+ protected PreDeCon.Settings settings;
+
@Override
protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
- configInnerDistance(config);
- configEpsilon(config, innerdist);
- configMinPts(config);
- configOuterDistance(config, epsilon, minpts, PreDeConSubspaceIndex.Factory.class, innerdist);
- configLambda(config);
+ settings = config.tryInstantiate(PreDeCon.Settings.class);
}
@Override
protected PreDeCon<V> makeInstance() {
- return new PreDeCon<>(epsilon, minpts, outerdist, lambda);
+ return new PreDeCon<>(settings);
}
}
} \ No newline at end of file
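After this rewrite, PreDeCon is a thin composition: PreDeConNeighborPredicate computes the preference-weighted epsilon-neighborhoods, PreDeConCorePredicate applies the mu and lambda checks, and the inherited GeneralizedDBSCAN does the clustering proper. A construction sketch using the public Settings fields above (parameter values illustrative only; assumes ELKI's DoubleVector as the vector type):

PreDeCon.Settings settings = new PreDeCon.Settings();
settings.epsilon = 10.;  // query radius
settings.minpts = 5;     // aka "mu"
settings.delta = 0.01;   // variance threshold for small eigenvalues
settings.lambda = 2;     // maximum preference dimensionality for core points
// settings.kappa keeps its default of 20.
PreDeCon<DoubleVector> predecon = new PreDeCon<>(settings);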
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
index e6245f6e..d6cf40fd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
*/
import java.util.ArrayList;
-import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.TreeMap;
@@ -45,10 +44,10 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionSelectingSubspaceDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -56,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -69,8 +68,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* </p>
* <p>
* Reference: <br>
- * K. Kailing, H.-P. Kriegel, P. Kroeger: Density connected Subspace Clustering
- * for High Dimensional Data. <br>
+ * K. Kailing, H.-P. Kriegel, P. Kröger:<br />
+ * Density connected Subspace Clustering for High Dimensional Data<br />
* In Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004.
* </p>
*
@@ -83,9 +82,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @param <V> the type of FeatureVector handled by this Algorithm
*/
@Title("SUBCLU: Density connected Subspace Clustering")
-@Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.")
-@Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004")
-public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+@Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. "//
+ + "SUBCLU delivers for each subspace the same clusters DBSCAN would have found, "//
+ + "when applied to this subspace seperately.")
+@Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", //
+title = "Density connected Subspace Clustering for High Dimensional Data", //
+booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004", //
+url = "http://www.siam.org/meetings/sdm04/proceedings/sdm04_023.pdf")
+public class SUBCLU<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -125,12 +129,12 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
* Holds the instance of the distance function specified by
* {@link #DISTANCE_FUNCTION_ID}.
*/
- private DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction;
+ private DimensionSelectingSubspaceDistanceFunction<V> distanceFunction;
/**
* Holds the value of {@link #EPSILON_ID}.
*/
- private DoubleDistance epsilon;
+ private double epsilon;
/**
* Holds the value of {@link #MINPTS_ID}.
@@ -140,7 +144,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
/**
* Holds the result;
*/
- private Clustering<SubspaceModel<V>> result;
+ private Clustering<SubspaceModel> result;
/**
* Constructor.
@@ -149,7 +153,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
* @param epsilon Epsilon value
* @param minpts Minpts value
*/
- public SUBCLU(DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction, DoubleDistance epsilon, int minpts) {
+ public SUBCLU(DimensionSelectingSubspaceDistanceFunction<V> distanceFunction, double epsilon, int minpts) {
super();
this.distanceFunction = distanceFunction;
this.epsilon = epsilon;
@@ -162,15 +166,13 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
* @param relation Relation to process
* @return Clustering result
*/
- public Clustering<SubspaceModel<V>> run(Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Relation<V> relation) {
final int dimensionality = RelationUtil.dimensionality(relation);
StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;
// Generate all 1-dimensional clusters
- if (stepprog != null) {
- stepprog.beginStep(1, "Generate all 1-dimensional clusters.", LOG);
- }
+ LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters.");
// mapping of dimensionality to set of subspaces
HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();
@@ -182,35 +184,35 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
// mapping of subspaces to list of clusters
TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator());
- for (int d = 0; d < dimensionality; d++) {
+ for(int d = 0; d < dimensionality; d++) {
Subspace currentSubspace = new Subspace(d);
List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);
- if (LOG.isDebuggingFiner()) {
+ if(LOG.isDebuggingFiner()) {
StringBuilder msg = new StringBuilder();
msg.append('\n').append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n");
- for (Cluster<Model> cluster : clusters) {
+ for(Cluster<Model> cluster : clusters) {
msg.append(" " + cluster.getIDs() + "\n");
}
LOG.debugFiner(msg.toString());
}
- if (!clusters.isEmpty()) {
+ if(!clusters.isEmpty()) {
s_1.add(currentSubspace);
clusterMap.put(currentSubspace, clusters);
}
}
// Generate (d+1)-dimensional clusters from d-dimensional clusters
- for (int d = 0; d < dimensionality - 1; d++) {
- if (stepprog != null) {
+ for(int d = 0; d < dimensionality - 1; d++) {
+ if(stepprog != null) {
stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG);
}
List<Subspace> subspaces = subspaceMap.get(d);
- if (subspaces == null || subspaces.isEmpty()) {
- if (stepprog != null) {
- for (int dim = d + 1; dim < dimensionality - 1; dim++) {
+ if(subspaces == null || subspaces.isEmpty()) {
+ if(stepprog != null) {
+ for(int dim = d + 1; dim < dimensionality - 1; dim++) {
stepprog.beginStep(dim + 2, "Generation of " + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG);
}
}
@@ -220,37 +222,37 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
List<Subspace> candidates = generateSubspaceCandidates(subspaces);
List<Subspace> s_d = new ArrayList<>();
- for (Subspace candidate : candidates) {
+ for(Subspace candidate : candidates) {
Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
- if (LOG.isDebuggingFine()) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString());
}
List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
List<Cluster<Model>> clusters = new ArrayList<>();
- for (Cluster<Model> cluster : bestSubspaceClusters) {
+ for(Cluster<Model> cluster : bestSubspaceClusters) {
List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate);
- if (!candidateClusters.isEmpty()) {
+ if(!candidateClusters.isEmpty()) {
clusters.addAll(candidateClusters);
}
}
- if (LOG.isDebuggingFine()) {
+ if(LOG.isDebuggingFine()) {
StringBuilder msg = new StringBuilder();
msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n");
- for (Cluster<Model> c : clusters) {
+ for(Cluster<Model> c : clusters) {
msg.append(" " + c.getIDs() + "\n");
}
LOG.debugFine(msg.toString());
}
- if (!clusters.isEmpty()) {
+ if(!clusters.isEmpty()) {
s_d.add(candidate);
clusterMap.put(candidate, clusters);
}
}
- if (!s_d.isEmpty()) {
+ if(!s_d.isEmpty()) {
subspaceMap.put(d + 1, s_d);
}
}
@@ -258,19 +260,17 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
// build result
int numClusters = 1;
result = new Clustering<>("SUBCLU clustering", "subclu-clustering");
- for (Subspace subspace : clusterMap.descendingKeySet()) {
+ for(Subspace subspace : clusterMap.descendingKeySet()) {
List<Cluster<Model>> clusters = clusterMap.get(subspace);
- for (Cluster<Model> cluster : clusters) {
- Cluster<SubspaceModel<V>> newCluster = new Cluster<>(cluster.getIDs());
- newCluster.setModel(new SubspaceModel<>(subspace, Centroid.make(relation, cluster.getIDs()).toVector(relation)));
+ for(Cluster<Model> cluster : clusters) {
+ Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs());
+ newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs())));
newCluster.setName("cluster_" + numClusters++);
result.addToplevelCluster(newCluster);
}
}
- if (stepprog != null) {
- stepprog.setCompleted(LOG);
- }
+ LOG.setCompleted(stepprog);
return result;
}
@@ -279,7 +279,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
*
* @return the result of the algorithm
*/
- public Clustering<SubspaceModel<V>> getResult() {
+ public Clustering<SubspaceModel> getResult() {
return result;
}
@@ -300,7 +300,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
distanceFunction.setSelectedDimensions(subspace.getDimensions());
ProxyDatabase proxy;
- if (ids == null) {
+ if(ids == null) {
// TODO: in this case, we might want to use an index - the proxy below
// will prevent this!
ids = relation.getDBIDs();
@@ -308,9 +308,9 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
proxy = new ProxyDatabase(ids, relation);
- DBSCAN<V, DoubleDistance> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
+ DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts);
// run DBSCAN
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
}
Clustering<Model> dbsres = dbscan.run(proxy);
@@ -318,8 +318,8 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
// separate cluster and noise
List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
List<Cluster<Model>> clusters = new ArrayList<>();
- for (Cluster<Model> c : clusterAndNoise) {
- if (!c.isNoise()) {
+ for(Cluster<Model> c : clusterAndNoise) {
+ if(!c.isNoise()) {
clusters.add(c);
}
}
@@ -336,7 +336,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) {
List<Subspace> candidates = new ArrayList<>();
- if (subspaces.isEmpty()) {
+ if(subspaces.isEmpty()) {
return candidates;
}
@@ -344,46 +344,46 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
int d = subspaces.get(0).dimensionality();
StringBuilder msgFine = new StringBuilder("\n");
- if (LOG.isDebuggingFiner()) {
+ if(LOG.isDebuggingFiner()) {
msgFine.append("subspaces ").append(subspaces).append('\n');
}
- for (int i = 0; i < subspaces.size(); i++) {
+ for(int i = 0; i < subspaces.size(); i++) {
Subspace s1 = subspaces.get(i);
- for (int j = i + 1; j < subspaces.size(); j++) {
+ for(int j = i + 1; j < subspaces.size(); j++) {
Subspace s2 = subspaces.get(j);
Subspace candidate = s1.join(s2);
- if (candidate != null) {
- if (LOG.isDebuggingFiner()) {
+ if(candidate != null) {
+ if(LOG.isDebuggingFiner()) {
msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
}
// prune irrelevant candidate subspaces
List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
- if (LOG.isDebuggingFiner()) {
+ if(LOG.isDebuggingFiner()) {
msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
}
boolean irrelevantCandidate = false;
- for (Subspace s : lowerSubspaces) {
- if (!subspaces.contains(s)) {
+ for(Subspace s : lowerSubspaces) {
+ if(!subspaces.contains(s)) {
irrelevantCandidate = true;
break;
}
}
- if (!irrelevantCandidate) {
+ if(!irrelevantCandidate) {
candidates.add(candidate);
}
}
}
}
- if (LOG.isDebuggingFiner()) {
+ if(LOG.isDebuggingFiner()) {
LOG.debugFiner(msgFine.toString());
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append(d + 1).append("-dimensional candidate subspaces: ");
- for (Subspace candidate : candidates) {
+ for(Subspace candidate : candidates) {
msg.append(candidate.dimensonsToString()).append(' ');
}
LOG.debug(msg.toString());
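The pruning step applies the same downward-closure argument as Apriori: density-connected clusters project to density-connected clusters, so a (d+1)-dimensional candidate can only contain clusters if every one of its d-dimensional projections did. For example, the candidate {1, 3, 5}, joined from {1, 3} and {1, 5}, is kept only if {3, 5} also produced clusters; otherwise it is discarded without ever running DBSCAN on it.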
@@ -401,16 +401,16 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
*/
private List<Subspace> lowerSubspaces(Subspace subspace) {
int dimensionality = subspace.dimensionality();
- if (dimensionality <= 1) {
+ if(dimensionality <= 1) {
return null;
}
// order result according to the dimensions
List<Subspace> result = new ArrayList<>();
- BitSet dimensions = subspace.getDimensions();
- for (int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) {
- BitSet newDimensions = (BitSet) dimensions.clone();
- newDimensions.set(dim, false);
+ long[] dimensions = subspace.getDimensions();
+ for(int dim = BitsUtil.nextSetBit(dimensions, 0); dim >= 0; dim = BitsUtil.nextSetBit(dimensions, dim + 1)) {
+ long[] newDimensions = dimensions.clone();
+ BitsUtil.clearI(newDimensions, dim);
result.add(new Subspace(newDimensions));
}
@@ -432,14 +432,14 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
private Subspace bestSubspace(List<Subspace> subspaces, Subspace candidate, TreeMap<Subspace, List<Cluster<Model>>> clusterMap) {
Subspace bestSubspace = null;
- for (Subspace subspace : subspaces) {
+ for(Subspace subspace : subspaces) {
int min = Integer.MAX_VALUE;
- if (subspace.isSubspace(candidate)) {
+ if(subspace.isSubspace(candidate)) {
List<Cluster<Model>> clusters = clusterMap.get(subspace);
- for (Cluster<Model> cluster : clusters) {
+ for(Cluster<Model> cluster : clusters) {
int clusterSize = cluster.size();
- if (clusterSize < min) {
+ if(clusterSize < min) {
min = clusterSize;
bestSubspace = subspace;
}
@@ -467,29 +467,29 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
protected int minpts = 0;
- protected DoubleDistance epsilon = null;
+ protected double epsilon;
- protected DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distance = null;
+ protected DimensionSelectingSubspaceDistanceFunction<V> distance = null;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance>> param = new ObjectParameter<>(DISTANCE_FUNCTION_ID, DimensionSelectingSubspaceDistanceFunction.class, SubspaceEuclideanDistanceFunction.class);
- if (config.grab(param)) {
+ ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V>> param = new ObjectParameter<>(DISTANCE_FUNCTION_ID, DimensionSelectingSubspaceDistanceFunction.class, SubspaceEuclideanDistanceFunction.class);
+ if(config.grab(param)) {
distance = param.instantiateClass(config);
}
- DistanceParameter<DoubleDistance> epsilonP = new DistanceParameter<>(EPSILON_ID, distance);
- if (config.grab(epsilonP)) {
+ DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID);
+ if(config.grab(epsilonP)) {
epsilon = epsilonP.getValue();
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
- if (config.grab(minptsP)) {
+ if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java
index 561816bd..c76c22b8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -34,6 +34,6 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
*
* @param <M> Model type
*/
-public interface SubspaceClusteringAlgorithm<M extends SubspaceModel<?>> extends ClusteringAlgorithm<Clustering<M>> {
+public interface SubspaceClusteringAlgorithm<M extends SubspaceModel> extends ClusteringAlgorithm<Clustering<M>> {
// No additional constraints
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java
new file mode 100644
index 00000000..42f1eaa2
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java
@@ -0,0 +1,132 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+
+/**
+ * Represents an interval in a certain dimension of the data space.
+ *
+ * @author Elke Achtert
+ */
+public class CLIQUEInterval implements Comparable<CLIQUEInterval> {
+ /**
+ * The dimension of this interval in the (original) data space.
+ */
+ private int dimension;
+
+ /**
+ * The minimum (left) value of this interval.
+ */
+ private double min;
+
+ /**
+ * The maximum (right) value of this interval.
+ */
+ private double max;
+
+ /**
+ * Creates a new interval with the specified parameters.
+ *
+ * @param dimension the dimension of the interval in the original data space
+ * @param min the minimum (left) value of the interval
+ * @param max the maximum (right) value of the interval
+ */
+ public CLIQUEInterval(int dimension, double min, double max) {
+ this.dimension = dimension;
+ this.min = min;
+ this.max = max;
+ }
+
+ /**
+ * Returns the dimension of the interval in the original data space
+ *
+ * @return the dimension of the interval in the original data space
+ */
+ public int getDimension() {
+ return dimension;
+ }
+
+ /**
+ * Returns the minimum (left) value of the interval.
+ *
+ * @return the minimum (left) value of the interval
+ */
+ public double getMin() {
+ return min;
+ }
+
+ /**
+ * Returns the maximum (right) value of the interval.
+ *
+ * @return the maximum (right) value of the interval
+ */
+ public double getMax() {
+ return max;
+ }
+
+ /**
+ * Returns a string representation of this interval. The string representation
+ * consists of the dimension and the min and max values of this interval.
+ *
+ * @return a string representation of this interval
+ */
+ @Override
+ public String toString() {
+ return "d" + (dimension + 1) + "-[" + FormatUtil.NF2.format(min) + "; " + FormatUtil.NF2.format(max) + "[";
+ }
+
+ /**
+ * Compares this interval with the specified interval for order. Returns a
+ * negative integer, zero, or a positive integer as this interval is less
+ * than, equal to, or greater than the specified interval. First the
+ * dimensions of the intervals are compared. In case of equality the min
+ * (left) values are compared.
+ *
+ * @param other the interval to be compared
+ * @return a negative integer, zero, or a positive integer as this object is
+ * less than, equal to, or greater than the specified object.
+ */
+ @Override
+ public int compareTo(CLIQUEInterval other) {
+ if(dimension < other.dimension) {
+ return -1;
+ }
+ if(dimension > other.dimension) {
+ return 1;
+ }
+
+ if(min < other.min) {
+ return -1;
+ }
+ if(min > other.min) {
+ return 1;
+ }
+
+ if(max != other.max) {
+ throw new RuntimeException("Intervals in the same dimension with equal min must have equal max!");
+ }
+ return 0;
+ }
+} \ No newline at end of file
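The ordering of CLIQUEInterval is lexicographic on (dimension, min); within one CLIQUE grid, two intervals that agree on both must be identical, which compareTo enforces with the exception above. A usage sketch (printed output assumes the two-decimal FormatUtil.NF2 formatting):

CLIQUEInterval a = new CLIQUEInterval(0, 0.0, 0.5);
CLIQUEInterval b = new CLIQUEInterval(0, 0.5, 1.0);
CLIQUEInterval c = new CLIQUEInterval(1, 0.0, 0.5);
assert a.compareTo(b) < 0; // same dimension: smaller min first
assert b.compareTo(c) < 0; // lower dimension first
System.out.println(a);     // d1-[0.00; 0.50[  (right-open interval)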
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java
index 50e3fcd5..4df3e2a5 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -24,16 +24,15 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique;
*/
import java.util.ArrayList;
-import java.util.BitSet;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
-import de.lmu.ifi.dbs.elki.data.Interval;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.Subspace;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
/**
@@ -46,7 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
*
* @param <V> the type of NumberVector this subspace contains
*/
-public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
+public class CLIQUESubspace<V extends NumberVector> extends Subspace {
/**
* The dense units belonging to this subspace.
*/
@@ -74,7 +73,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
*
* @param dimensions the dimensions building this subspace
*/
- public CLIQUESubspace(BitSet dimensions) {
+ public CLIQUESubspace(long[] dimensions) {
super(dimensions);
denseUnits = new ArrayList<>();
coverage = 0;
@@ -86,9 +85,9 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
* @param unit the unit to be added.
*/
public void addDenseUnit(CLIQUEUnit<V> unit) {
- Collection<Interval> intervals = unit.getIntervals();
- for(Interval interval : intervals) {
- if(!getDimensions().get(interval.getDimension())) {
+ Collection<CLIQUEInterval> intervals = unit.getIntervals();
+ for(CLIQUEInterval interval : intervals) {
+ if(!BitsUtil.get(getDimensions(), interval.getDimension())) {
throw new IllegalArgumentException("Unit " + unit + " cannot be added to this subspace because of wrong dimensions!");
}
}
@@ -131,7 +130,8 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
unit.markAsAssigned();
model.addDenseUnit(unit);
- for(int dim = getDimensions().nextSetBit(0); dim >= 0; dim = getDimensions().nextSetBit(dim + 1)) {
+ final long[] dims = getDimensions();
+ for(int dim = BitsUtil.nextSetBit(dims, 0); dim >= 0; dim = BitsUtil.nextSetBit(dims, dim + 1)) {
CLIQUEUnit<V> left = leftNeighbor(unit, dim);
if(left != null && !left.isAssigned()) {
dfs(left, cluster, model);
@@ -152,7 +152,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
* @return the left neighbor of the given unit in the specified dimension
*/
public CLIQUEUnit<V> leftNeighbor(CLIQUEUnit<V> unit, int dim) {
- Interval i = unit.getInterval(Integer.valueOf(dim));
+ CLIQUEInterval i = unit.getInterval(dim);
for(CLIQUEUnit<V> u : getDenseUnits()) {
if(u.containsLeftNeighbor(i)) {
@@ -170,7 +170,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
* @return the right neighbor of the given unit in the specified dimension
*/
public CLIQUEUnit<V> rightNeighbor(CLIQUEUnit<V> unit, Integer dim) {
- Interval i = unit.getInterval(dim);
+ CLIQUEInterval i = unit.getInterval(dim);
for(CLIQUEUnit<V> u : getDenseUnits()) {
if(u.containsRightNeighbor(i)) {
@@ -212,7 +212,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace {
* @see de.lmu.ifi.dbs.elki.data.Subspace#joinLastDimensions
*/
public CLIQUESubspace<V> join(CLIQUESubspace<V> other, double all, double tau) {
- BitSet dimensions = joinLastDimensions(other);
+ long[] dimensions = joinLastDimensions(other);
if(dimensions == null) {
return null;
}
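The recurring change in this file — java.util.BitSet replaced by ELKI's long[]-packed bitmasks via BitsUtil — alters the iteration idiom, so a standalone sketch may help. BitsUtil.get and BitsUtil.nextSetBit appear verbatim in the hunks above; BitsUtil.zero and BitsUtil.setI are assumed here to allocate a mask and set a bit in place with the obvious signatures:

import de.lmu.ifi.dbs.elki.utilities.BitsUtil;

// Sketch of the long[] bitmask idiom used throughout this change set.
public class BitsUtilIdiomSketch {
  public static void main(String[] args) {
    long[] dims = BitsUtil.zero(10); // mask for 10 dimensions, all bits clear (assumed API)
    BitsUtil.setI(dims, 2);          // select dimension 2, in place (assumed API)
    BitsUtil.setI(dims, 7);          // select dimension 7, in place (assumed API)

    System.out.println(BitsUtil.get(dims, 2)); // replacement for BitSet.get: true

    // Replacement for the BitSet.nextSetBit(...) loop, as in the hunk above:
    for (int d = BitsUtil.nextSetBit(dims, 0); d >= 0; d = BitsUtil.nextSetBit(dims, d + 1)) {
      System.out.println("selected dimension: " + d);
    }
  }
}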
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java
index a71b2b67..8e9f139c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -29,7 +29,6 @@ import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
-import de.lmu.ifi.dbs.elki.data.Interval;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
@@ -42,21 +41,22 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
*
* @author Elke Achtert
*
+ * @apiviz.composedOf CLIQUEInterval
* @apiviz.composedOf ModifiableDBIDs
*
* @param <V> the type of NumberVector this unit contains
*/
-public class CLIQUEUnit<V extends NumberVector<?>> {
+public class CLIQUEUnit<V extends NumberVector> {
/**
* The one-dimensional intervals of which this unit is built.
*/
- private SortedSet<Interval> intervals;
+ private SortedSet<CLIQUEInterval> intervals;
/**
- * Provides a mapping of particular dimensions to the intervals of which this
- * unit is build.
+ * Mapping of particular dimensions to the intervals of which this unit is
+ * built.
*/
- private TIntObjectHashMap<Interval> dimensionToInterval;
+ private TIntObjectHashMap<CLIQUEInterval> dimensionToInterval;
/**
* The ids of the feature vectors this unit contains.
@@ -74,11 +74,11 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
* @param intervals the intervals belonging to this unit
* @param ids the ids of the feature vectors belonging to this unit
*/
- public CLIQUEUnit(SortedSet<Interval> intervals, ModifiableDBIDs ids) {
+ public CLIQUEUnit(SortedSet<CLIQUEInterval> intervals, ModifiableDBIDs ids) {
this.intervals = intervals;
dimensionToInterval = new TIntObjectHashMap<>();
- for(Interval interval : intervals) {
+ for(CLIQUEInterval interval : intervals) {
dimensionToInterval.put(interval.getDimension(), interval);
}
@@ -92,7 +92,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
*
* @param interval the interval belonging to this unit
*/
- public CLIQUEUnit(Interval interval) {
+ public CLIQUEUnit(CLIQUEInterval interval) {
intervals = new TreeSet<>();
intervals.add(interval);
@@ -113,7 +113,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
* vector, false otherwise
*/
public boolean contains(V vector) {
- for(Interval interval : intervals) {
+ for(CLIQUEInterval interval : intervals) {
final double value = vector.doubleValue(interval.getDimension());
if(interval.getMin() > value || value >= interval.getMax()) {
return false;
@@ -164,7 +164,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
*
* @return a sorted set of the intervals of which this unit is built
*/
- public SortedSet<Interval> getIntervals() {
+ public SortedSet<CLIQUEInterval> getIntervals() {
return intervals;
}
@@ -174,7 +174,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
* @param dimension the dimension of the interval to be returned
* @return the interval of the specified dimension
*/
- public Interval getInterval(Integer dimension) {
+ public CLIQUEInterval getInterval(int dimension) {
return dimensionToInterval.get(dimension);
}
@@ -186,8 +186,8 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
* @return true if this unit contains the left neighbor of the specified
* interval, false otherwise
*/
- public boolean containsLeftNeighbor(Interval i) {
- Interval interval = dimensionToInterval.get(i.getDimension());
+ public boolean containsLeftNeighbor(CLIQUEInterval i) {
+ CLIQUEInterval interval = dimensionToInterval.get(i.getDimension());
if(interval == null) {
return false;
}
@@ -202,8 +202,8 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
* @return true if this unit contains the right neighbor of the specified
* interval, false otherwise
*/
- public boolean containsRightNeighbor(Interval i) {
- Interval interval = dimensionToInterval.get(i.getDimension());
+ public boolean containsRightNeighbor(CLIQUEInterval i) {
+ CLIQUEInterval interval = dimensionToInterval.get(i.getDimension());
if(interval == null) {
return false;
}
@@ -246,15 +246,15 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
* greater than tau, null otherwise
*/
public CLIQUEUnit<V> join(CLIQUEUnit<V> other, double all, double tau) {
- Interval i1 = this.intervals.last();
- Interval i2 = other.intervals.last();
+ CLIQUEInterval i1 = this.intervals.last();
+ CLIQUEInterval i2 = other.intervals.last();
if(i1.getDimension() >= i2.getDimension()) {
return null;
}
- Iterator<Interval> it1 = this.intervals.iterator();
- Iterator<Interval> it2 = other.intervals.iterator();
- SortedSet<Interval> resultIntervals = new TreeSet<>();
+ Iterator<CLIQUEInterval> it1 = this.intervals.iterator();
+ Iterator<CLIQUEInterval> it2 = other.intervals.iterator();
+ SortedSet<CLIQUEInterval> resultIntervals = new TreeSet<>();
for(int i = 0; i < this.intervals.size() - 1; i++) {
i1 = it1.next();
i2 = it2.next();
@@ -285,7 +285,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> {
@Override
public String toString() {
StringBuilder result = new StringBuilder();
- for(Interval interval : intervals) {
+ for(CLIQUEInterval interval : intervals) {
result.append(interval).append(' ');
}
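Two semantics in this file are easy to misread from the hunks alone. First, contains(V) uses half-open intervals: a point lies in the unit iff min <= value < max in every dimension, matching the "[min; max[" notation of CLIQUEInterval.toString(). Second, join(other, all, tau) only combines units whose last intervals differ in dimension — the apriori-style candidate generation of CLIQUE. A minimal sketch of the containment rule in isolation, using a plain double rather than a NumberVector:

// Sketch of the half-open containment rule used by CLIQUEUnit.contains().
public class HalfOpenIntervalDemo {
  static boolean inInterval(double value, double min, double max) {
    // Mirrors the negated test in contains(): reject if value < min or value >= max.
    return min <= value && value < max;
  }

  public static void main(String[] args) {
    System.out.println(inInterval(0.5, 0.0, 1.0)); // true
    System.out.println(inInterval(0.0, 0.0, 1.0)); // true: left endpoint included
    System.out.println(inInterval(1.0, 0.0, 1.0)); // false: right endpoint excluded
  }
}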
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java
index 7acd7572..87f435b4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java
@@ -7,7 +7,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
+Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java
index 2efa038d..6247de84 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java
@@ -5,27 +5,30 @@
* subspace clustering algorithms according to the classical but somewhat obsolete classification schema
* of clustering algorithms for axis-parallel subspaces.
*
+ * @apiviz.exclude ^de.lmu.ifi.dbs.elki.algorithm.Abstract
+ * @apiviz.exclude ^de.lmu.ifi.dbs.elki.algorithm\.clustering\.subspace\.P3C\.(ClusterCandidate|Signature)
+ * @apiviz.exclude ^de.lmu.ifi.dbs.elki.algorithm\.clustering\.subspace\.clique\.
*/
/*
-This file is part of ELKI:
-Environment for Developing KDD-Applications Supported by Index-Structures
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2013
-Ludwig-Maximilians-Universität München
-Lehr- und Forschungseinheit für Datenbanksysteme
-ELKI Development Team
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; \ No newline at end of file