diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace')
14 files changed, 1560 insertions, 881 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index 617d74cd..f55ec964 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -35,9 +35,9 @@ import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEUnit; +import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEInterval; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.Interval; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; @@ -95,7 +95,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications") @Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.") @Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314") -public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class CLIQUE<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** * The logger for this class. */ @@ -165,7 +165,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * @param relation Data relation to process * @return Clustering result */ - public Clustering<SubspaceModel<V>> run(Relation<V> relation) { + public Clustering<SubspaceModel> run(Relation<V> relation) { // 1. Identification of subspaces that contain clusters // TODO: use step logging. if(LOG.isVerbose()) { @@ -203,7 +203,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } // build result int numClusters = 1; - Clustering<SubspaceModel<V>> result = new Clustering<>("CLIQUE clustering", "clique-clustering"); + Clustering<SubspaceModel> result = new Clustering<>("CLIQUE clustering", "clique-clustering"); for(Integer dim : dimensionToDenseSubspaces.keySet()) { List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim); List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces); @@ -213,8 +213,8 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } for(Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) { - Cluster<SubspaceModel<V>> newCluster = new Cluster<>(modelAndCluster.second); - newCluster.setModel(new SubspaceModel<>(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).toVector(relation))); + Cluster<SubspaceModel> newCluster = new Cluster<>(modelAndCluster.second); + newCluster.setModel(new SubspaceModel(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second))); newCluster.setName("cluster_" + numClusters++); result.addToplevelCluster(newCluster); } @@ -313,9 +313,9 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster if(LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); - msg.append(" minima: ").append(FormatUtil.format(minima, ", ", 2)); - msg.append("\n maxima: ").append(FormatUtil.format(maxima, ", ", 2)); - msg.append("\n unit lengths: ").append(FormatUtil.format(unit_lengths, ", ", 2)); + msg.append(" minima: ").append(FormatUtil.format(minima, ", ", FormatUtil.NF2)); + msg.append("\n maxima: ").append(FormatUtil.format(maxima, ", ", FormatUtil.NF2)); + msg.append("\n unit lengths: ").append(FormatUtil.format(unit_lengths, ", ", FormatUtil.NF2)); LOG.debugFiner(msg.toString()); } @@ -341,7 +341,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster List<CLIQUEUnit<V>> units = new ArrayList<>((xsi * dimensionality)); for(int x = 0; x < xsi; x++) { for(int d = 0; d < dimensionality; d++) { - units.add(new CLIQUEUnit<V>(new Interval(d, unit_bounds[x][d], unit_bounds[x + 1][d]))); + units.add(new CLIQUEUnit<V>(new CLIQUEInterval(d, unit_bounds[x][d], unit_bounds[x + 1][d]))); } } @@ -582,7 +582,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { protected int xsi; protected double tau; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java index 5f798a66..fa8bbbe4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team @@ -23,7 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.BitSet;
import java.util.Random;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
@@ -47,12 +46,12 @@ import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceMaximumDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -66,14 +65,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; /**
* <p>
- * Provides the DOC algorithm, and it's heuristic variant, FastDOC. DOC is a
- * sampling based subspace clustering algorithm.
+ * The DOC algorithm, and it's heuristic variant, FastDOC. DOC is a sampling
+ * based subspace clustering algorithm.
* </p>
*
* <p>
* Reference: <br/>
* C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali<br />
- * A Monte Carlo algorithm for fast projective clustering. <br/>
+ * A Monte Carlo algorithm for fast projective clustering. <br />
* In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02).
* </p>
*
@@ -84,8 +83,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; * @param <V> the type of NumberVector handled by this Algorithm.
*/
@Title("DOC: Density-based Optimal projective Clustering")
-@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", title = "A Monte Carlo algorithm for fast projective clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", url = "http://dx.doi.org/10.1145/564691.564739")
-public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", //
+title = "A Monte Carlo algorithm for fast projective clustering", //
+booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", //
+url = "http://dx.doi.org/10.1145/564691.564739")
+public class DOC<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -152,7 +154,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param database Database
* @param relation Data relation
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
// Dimensionality of our set.
final int d = RelationUtil.dimensionality(relation);
@@ -173,7 +175,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering int minClusterSize = (int) (alpha * S.size());
// List of all clusters we found.
- Clustering<SubspaceModel<V>> result = new Clustering<>("DOC Clusters", "DOC");
+ Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
// Inform the user about the number of actual clusters found so far.
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
@@ -181,7 +183,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering // To not only find a single cluster, we continue running until our set
// of points is empty.
while(S.size() > minClusterSize) {
- Cluster<SubspaceModel<V>> C;
+ Cluster<SubspaceModel> C;
if(heuristics) {
C = runFastDOC(relation, S, d, n, m, (int) r);
}
@@ -206,15 +208,10 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering // Add the remainder as noise.
if(S.size() > 0) {
- BitSet alldims = new BitSet();
- alldims.set(0, d);
- result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel<>(new Subspace(alldims), Centroid.make(relation, S).toVector(relation))));
+ long[] alldims = BitsUtil.ones(d);
+ result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S))));
}
-
- if(cprogress != null) {
- cprogress.setCompleted(LOG);
- }
-
+ LOG.setCompleted(cprogress);
return result;
}
@@ -230,12 +227,11 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param minClusterSize Minimum size a cluster must have to be accepted.
* @return a cluster, if one is found, else <code>null</code>.
*/
- private Cluster<SubspaceModel<V>> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
- final DoubleDistance wd = new DoubleDistance(w);
+ private Cluster<SubspaceModel> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
// Best cluster for the current run.
DBIDs C = null;
// Relevant attributes for the best cluster.
- BitSet D = null;
+ long[] D = null;
// Quality of the best cluster.
double quality = Double.NEGATIVE_INFINITY;
@@ -244,9 +240,9 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering // double[d], new double[d]);
// Weights for distance (= rectangle query)
- SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(new BitSet(d));
- DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
- RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq);
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(BitsUtil.zero(d));
+ DistanceQuery<V> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V> rq = relation.getDatabase().getRangeQuery(dq);
// Inform the user about the progress in the current iteration.
FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
@@ -263,22 +259,22 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
// Initialize cluster info.
- BitSet nD = new BitSet(d);
+ long[] nD = BitsUtil.zero(d);
// Test each dimension and build bounding box.
for(int k = 0; k < d; ++k) {
if(dimensionIsRelevant(k, relation, randomSet)) {
- nD.set(k);
+ BitsUtil.setI(nD, k);
}
}
- if(nD.cardinality() > 0) {
+ if(BitsUtil.cardinality(nD) > 0) {
// Get all points in the box.
df.setSelectedDimensions(nD);
// TODO: add filtering capabilities into query API!
- DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, wd));
+ DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, w));
if(LOG.isDebuggingFiner()) {
- LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + nD.cardinality());
+ LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + BitsUtil.cardinality(nD));
}
// Is the cluster large enough?
@@ -290,7 +286,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
else {
// Better cluster than before?
- double nQuality = computeClusterQuality(nC.size(), nD.cardinality());
+ double nQuality = computeClusterQuality(nC.size(), BitsUtil.cardinality(nD));
if(nQuality > quality) {
if(LOG.isDebuggingFiner()) {
LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
@@ -306,23 +302,12 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
}
}
-
- if(iprogress != null) {
- iprogress.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(iprogress);
}
}
+ LOG.ensureCompleted(iprogress);
- if(iprogress != null) {
- iprogress.ensureCompleted(LOG);
- }
-
- if(C != null) {
- return makeCluster(relation, C, D);
- }
- else {
- return null;
- }
+ return (C != null) ? makeCluster(relation, C, D) : null;
}
/**
@@ -336,9 +321,9 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param n Number of outer iterations (seed points).
* @return a cluster, if one is found, else <code>null</code>.
*/
- private Cluster<SubspaceModel<V>> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
+ private Cluster<SubspaceModel> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
// Relevant attributes of highest cardinality.
- BitSet D = null;
+ long[] D = null;
// The seed point for the best dimensions.
DBIDVar dV = DBIDUtil.newVar();
@@ -357,57 +342,46 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
// Initialize cluster info.
- BitSet nD = new BitSet(d);
+ long[] nD = BitsUtil.zero(d);
// Test each dimension.
for(int k = 0; k < d; ++k) {
if(dimensionIsRelevant(k, relation, randomSet)) {
- nD.set(k);
+ BitsUtil.setI(nD, k);
}
}
- if(D == null || nD.cardinality() > D.cardinality()) {
+ if(D == null || BitsUtil.cardinality(nD) > BitsUtil.cardinality(D)) {
D = nD;
dV.set(iter);
- if(D.cardinality() >= d_zero) {
+ if(BitsUtil.cardinality(D) >= d_zero) {
if(iprogress != null) {
iprogress.setProcessed(iprogress.getTotal(), LOG);
}
break outer;
}
}
-
- if(iprogress != null) {
- iprogress.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(iprogress);
}
}
-
- if(iprogress != null) {
- iprogress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(iprogress);
// If no relevant dimensions were found, skip it.
- if(D == null || D.cardinality() == 0) {
+ if(D == null || BitsUtil.cardinality(D) == 0) {
return null;
}
// Get all points in the box.
SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
- DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
- RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
+ DistanceQuery<V> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
// TODO: add filtering capabilities into query API!
- DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, new DoubleDistance(w)));
+ DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, w));
// If we have a non-empty cluster, return it.
- if(C.size() > 0) {
- return makeCluster(relation, C, D);
- }
- else {
- return null;
- }
+ return (C.size() > 0) ? makeCluster(relation, C, D) : null;
}
/**
@@ -421,12 +395,11 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @return <code>true</code> if the dimension is relevant.
*/
private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
- double min = Double.POSITIVE_INFINITY;
- double max = Double.NEGATIVE_INFINITY;
+ double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
for(DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
- V xV = relation.get(iter);
- min = Math.min(min, xV.doubleValue(dimension));
- max = Math.max(max, xV.doubleValue(dimension));
+ double xV = relation.get(iter).doubleValue(dimension);
+ min = (xV < min) ? xV : min;
+ max = (xV > max) ? xV : max;
if(max - min > w) {
return false;
}
@@ -443,10 +416,10 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param D the relevant dimensions.
* @return an object representing the subspace cluster.
*/
- private Cluster<SubspaceModel<V>> makeCluster(Relation<V> relation, DBIDs C, BitSet D) {
+ private Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
- Cluster<SubspaceModel<V>> cluster = new Cluster<>(ids);
- cluster.setModel(new SubspaceModel<>(new Subspace(D), Centroid.make(relation, ids).toVector(relation)));
+ Cluster<SubspaceModel> cluster = new Cluster<>(ids);
+ cluster.setModel(new SubspaceModel(new Subspace(D), Centroid.make(relation, ids)));
return cluster;
}
@@ -483,7 +456,7 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering *
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Relative density threshold parameter Alpha.
*/
@@ -549,26 +522,26 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering super.makeOptions(config);
{
- DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2);
- param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
- param.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE) //
+ .addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
if(config.grab(param)) {
alpha = param.getValue();
}
}
{
- DoubleParameter param = new DoubleParameter(BETA_ID, 0.8);
- param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
- param.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ DoubleParameter param = new DoubleParameter(BETA_ID, 0.8) //
+ .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE) //
+ .addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
if(config.grab(param)) {
beta = param.getValue();
}
}
{
- DoubleParameter param = new DoubleParameter(W_ID, 0.05);
- param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ DoubleParameter param = new DoubleParameter(W_ID, 0.05) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
if(config.grab(param)) {
w = param.getValue();
}
@@ -582,8 +555,8 @@ public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
if(heuristics) {
- IntParameter param = new IntParameter(D_ZERO_ID, 5);
- param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ IntParameter param = new IntParameter(D_ZERO_ID, 5) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(param)) {
d_zero = param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index cd5e51b8..20849dd6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,8 +23,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.map.hash.TCustomHashMap; +import gnu.trove.procedure.TObjectObjectProcedure; + import java.util.ArrayList; -import java.util.BitSet; import java.util.Collection; import java.util.Collections; import java.util.Comparator; @@ -34,7 +36,9 @@ import java.util.List; import java.util.Map; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS; +import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.ClusterOrderResult; +import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.CorrelationClusterOrderEntry; +import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.GeneralizedOPTICS; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -42,26 +46,20 @@ import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DiSHDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.AbstractDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelationDistance; import de.lmu.ifi.dbs.elki.index.preprocessed.preference.DiSHPreferenceVectorIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid; -import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; -import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; -import de.lmu.ifi.dbs.elki.utilities.FormatUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -73,7 +71,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -93,7 +90,6 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * @author Elke Achtert * * @apiviz.uses DiSHPreferenceVectorIndex - * @apiviz.uses DiSHDistanceFunction * @apiviz.has SubspaceModel * * @param <V> the type of NumberVector handled by this Algorithm @@ -101,94 +97,61 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("DiSH: Detecting Subspace cluster Hierarchies") @Description("Algorithm to find hierarchical correlation clusters in subspaces.") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15") -public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +public class DiSH<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(DiSH.class); /** - * Parameter that specifies the maximum radius of the neighborhood to be - * considered in each dimension for determination of the preference vector, - * must be a double equal to or greater than 0. - * <p> - * Default value: {@code 0.001} - * </p> - * <p> - * Key: {@code -dish.epsilon} - * </p> - */ - public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", "The maximum radius of the neighborhood " + "to be considered in each dimension for determination of " + "the preference vector."); - - /** - * Parameter that specifies the a minimum number of points as a smoothing - * factor to avoid the single-link-effect, must be an integer greater than 0. - * <p> - * Default value: {@code 1} - * </p> - * <p> - * Key: {@code -dish.mu} - * </p> - */ - public static final OptionID MU_ID = new OptionID("dish.mu", "The minimum number of points as a smoothing factor to avoid the single-link-effekt."); - - /** - * Holds the value of {@link #EPSILON_ID}. + * Holds the value of {@link Parameterizer#EPSILON_ID}. */ private double epsilon; /** - * The distance function we use + * The DiSH preprocessor. */ - private DiSHDistanceFunction dishDistance; + private DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor; /** - * Parameters that were given to OPTICS + * OPTICS minPts parameter. */ - private Collection<Pair<OptionID, Object>> opticsAlgorithmParameters; + private int mu; /** * Constructor. * * @param epsilon Epsilon value - * @param dishDistance Distance function - * @param opticsAlgorithmParameters OPTICS parameters + * @param mu Mu parameter (minPts) + * @param dishPreprocessor DiSH preprocessor */ - public DiSH(double epsilon, DiSHDistanceFunction dishDistance, Collection<Pair<OptionID, Object>> opticsAlgorithmParameters) { + public DiSH(double epsilon, int mu, DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor) { super(); this.epsilon = epsilon; - this.dishDistance = dishDistance; - this.opticsAlgorithmParameters = opticsAlgorithmParameters; + this.mu = mu; + this.dishPreprocessor = dishPreprocessor; } /** * Performs the DiSH algorithm on the given database. * - * @param database Database to process * @param relation Relation to process */ - public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { - // Instantiate DiSH distance (and thus run the preprocessor) + public Clustering<SubspaceModel> run(Relation<V> relation) { if(LOG.isVerbose()) { - LOG.verbose("*** Run DiSH preprocessor."); + LOG.verbose("Running the DiSH preprocessor."); } - DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation); - // Configure and run OPTICS. + DiSHPreferenceVectorIndex<V> indexinst = dishPreprocessor.instantiate(relation); if(LOG.isVerbose()) { - LOG.verbose("*** Run OPTICS algorithm."); + LOG.verbose("Running the OPTICS algorithm."); } - ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters); - opticsconfig.addParameter(OPTICS.DISTANCE_FUNCTION_ID, ProxyDistanceFunction.proxy(dishDistanceQuery)); - Class<OPTICS<V, PreferenceVectorBasedCorrelationDistance>> cls = ClassGenericsUtil.uglyCastIntoSubclass(OPTICS.class); - OPTICS<V, PreferenceVectorBasedCorrelationDistance> optics = null; - optics = opticsconfig.tryInstantiate(cls); - ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation); + ClusterOrderResult<DiSHClusterOrderEntry> opticsResult = new DiSHOPTICS(indexinst).run(relation); if(LOG.isVerbose()) { - LOG.verbose("*** Compute Clusters."); + LOG.verbose("Compute Clusters."); } - return computeClusters(relation, opticsResult, dishDistanceQuery); + return computeClusters(relation, opticsResult); } /** @@ -196,58 +159,68 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * * @param database the database holding the objects * @param clusterOrder the cluster order - * @param distFunc Distance function */ - private Clustering<SubspaceModel<V>> computeClusters(Relation<V> database, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder, DiSHDistanceFunction.Instance<V> distFunc) { - int dimensionality = RelationUtil.dimensionality(database); - int minpts = dishDistance.getMinpts(); + private Clustering<SubspaceModel> computeClusters(Relation<V> database, ClusterOrderResult<DiSHClusterOrderEntry> clusterOrder) { + final int dimensionality = RelationUtil.dimensionality(database); // extract clusters - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder); + TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = extractClusters(database, clusterOrder); if(LOG.isVerbose()) { - StringBuilder msg = new StringBuilder("Step 1: extract clusters"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); + final StringBuilder msg = new StringBuilder("Step 1: extract clusters\n"); + clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() { + @Override + public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) { + msg.append(BitsUtil.toStringLow(key, dimensionality)).append(" sizes:"); + for(ArrayModifiableDBIDs c : clusters) { + msg.append(' ').append(c.size()); + } + msg.append('\n'); + return true; // continue } - } + }); LOG.verbose(msg.toString()); } // check if there are clusters < minpts - checkClusters(database, distFunc, clustersMap, minpts); + checkClusters(database, clustersMap); if(LOG.isVerbose()) { - StringBuilder msg = new StringBuilder("Step 2: check clusters"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); + final StringBuilder msg = new StringBuilder("Step 2: check clusters\n"); + clustersMap.forEachEntry(new TObjectObjectProcedure<long[], List<ArrayModifiableDBIDs>>() { + @Override + public boolean execute(long[] key, List<ArrayModifiableDBIDs> clusters) { + msg.append(BitsUtil.toStringLow(key, dimensionality)).append(" sizes:"); + for(ArrayModifiableDBIDs c : clusters) { + msg.append(' ').append(c.size()); + } + msg.append('\n'); + return true; // continue } - } + }); LOG.verbose(msg.toString()); } // sort the clusters - List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap); + List<Cluster<SubspaceModel>> clusters = sortClusters(database, clustersMap); if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 3: sort clusters"); - for(Cluster<SubspaceModel<V>> c : clusters) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size()); + for(Cluster<SubspaceModel> c : clusters) { + msg.append('\n').append(BitsUtil.toStringLow(c.getModel().getSubspace().getDimensions(), dimensionality)).append(" ids ").append(c.size()); } LOG.verbose(msg.toString()); } // build the hierarchy - Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); - buildHierarchy(database, distFunc, clustering, clusters, dimensionality); + Clustering<SubspaceModel> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); + buildHierarchy(database, clustering, clusters, dimensionality); if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 4: build hierarchy"); - for(Cluster<SubspaceModel<V>> c : clusters) { - msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); - for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { + for(Cluster<SubspaceModel> c : clusters) { + msg.append('\n').append(BitsUtil.toStringLow(c.getModel().getSubspace().getDimensions(), dimensionality)).append(" ids ").append(c.size()); + for(Iter<Cluster<SubspaceModel>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { msg.append("\n parent ").append(iter.get()); } - for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { + for(Iter<Cluster<SubspaceModel>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { msg.append("\n child ").append(iter.get()); } } @@ -255,7 +228,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } // build result - for(Cluster<SubspaceModel<V>> c : clusters) { + for(Cluster<SubspaceModel> c : clusters) { if(clustering.getClusterHierarchy().numParents(c) == 0) { clustering.addToplevelCluster(c); } @@ -267,37 +240,38 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Extracts the clusters from the cluster order. * * @param database the database storing the objects - * @param distFunc the distance function * @param clusterOrder the cluster order to extract the clusters from * @return the extracted clusters */ - private Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> extractClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder) { + private TCustomHashMap<long[], List<ArrayModifiableDBIDs>> extractClusters(Relation<V> database, ClusterOrderResult<DiSHClusterOrderEntry> clusterOrder) { FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extract Clusters", database.size(), LOG) : null; int processed = 0; - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>(); - Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>(); - Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); - for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { - ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next(); + TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY); + // Note clusterOrder currently contains DBID objects anyway. + Map<DBID, DiSHClusterOrderEntry> entryMap = new HashMap<>(); + Map<DBID, Pair<long[], ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); + for(Iterator<DiSHClusterOrderEntry> it = clusterOrder.iterator(); it.hasNext();) { + DiSHClusterOrderEntry entry = it.next(); entryMap.put(entry.getID(), entry); V object = database.get(entry.getID()); - BitSet preferenceVector = entry.getReachability().getCommonPreferenceVector(); + long[] preferenceVector = entry.getCommonPreferenceVector(); // get the list of (parallel) clusters for the preference vector - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector); + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(preferenceVector); if(parallelClusters == null) { parallelClusters = new ArrayList<>(); clustersMap.put(preferenceVector, parallelClusters); } // look for the proper cluster - Pair<BitSet, ArrayModifiableDBIDs> cluster = null; - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database); - PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector); - if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { - double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector()); + ArrayModifiableDBIDs cluster = null; + for(ArrayModifiableDBIDs c : parallelClusters) { + Vector c_centroid = ProjectedCentroid.make(preferenceVector, database, c); + long[] commonPreferenceVector = BitsUtil.andCMin(preferenceVector, preferenceVector); + int subspaceDim = subspaceDimensionality(object, c_centroid, preferenceVector, preferenceVector, commonPreferenceVector); + if(subspaceDim == entry.getCorrelationValue()) { + double d = weightedDistance(object, c_centroid, commonPreferenceVector); if(d <= 2 * epsilon) { cluster = c; break; @@ -305,57 +279,56 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } if(cluster == null) { - cluster = new Pair<>(preferenceVector, DBIDUtil.newArray()); + cluster = DBIDUtil.newArray(); parallelClusters.add(cluster); } - cluster.second.add(entry.getID()); - entryToClusterMap.put(entry.getID(), cluster); + cluster.add(entry.getID()); + entryToClusterMap.put(entry.getID(), new Pair<>(preferenceVector, cluster)); if(progress != null) { progress.setProcessed(++processed, LOG); } } - if(progress != null) { - progress.ensureCompleted(LOG); - } + LOG.ensureCompleted(progress); if(LOG.isDebuggingFiner()) { + int dim = RelationUtil.dimensionality(database); StringBuilder msg = new StringBuilder("Step 0"); - for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { - msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size()); + for(Map.Entry<long[], List<ArrayModifiableDBIDs>> clusterList : clustersMap.entrySet()) { + for(ArrayModifiableDBIDs c : clusterList.getValue()) { + msg.append('\n').append(BitsUtil.toStringLow(clusterList.getKey(), dim)).append(" ids ").append(c.size()); } } LOG.debugFiner(msg.toString()); } // add the predecessor to the cluster - for(BitSet pv : clustersMap.keySet()) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { - if(cluster.second.isEmpty()) { + for(long[] pv : clustersMap.keySet()) { + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); + for(ArrayModifiableDBIDs cluster : parallelClusters) { + if(cluster.isEmpty()) { continue; } - DBID firstID = cluster.second.get(0); - ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID); + DBID firstID = cluster.get(0); + DiSHClusterOrderEntry entry = entryMap.get(firstID); DBID predecessorID = entry.getPredecessorID(); if(predecessorID == null) { continue; } - ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID); + DiSHClusterOrderEntry predecessor = entryMap.get(predecessorID); // parallel cluster - if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { + if(BitsUtil.equal(predecessor.getCommonPreferenceVector(), entry.getCommonPreferenceVector())) { continue; } - if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) { + if(predecessor.compareTo(entry) < 0) { continue; } - Pair<BitSet, ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID); + Pair<long[], ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID); oldCluster.second.remove(predecessorID); - cluster.second.add(predecessorID); + cluster.add(predecessorID); entryToClusterMap.remove(predecessorID); - entryToClusterMap.put(predecessorID, cluster); + entryToClusterMap.put(predecessorID, new Pair<>(pv, cluster)); } } @@ -366,21 +339,21 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Returns a sorted list of the clusters w.r.t. the subspace dimensionality in * descending order. * - * @param database the database storing the objects + * @param relation the database storing the objects * @param clustersMap the mapping of bits sets to clusters * @return a sorted list of the clusters */ - private List<Cluster<SubspaceModel<V>>> sortClusters(Relation<V> database, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { - final int db_dim = RelationUtil.dimensionality(database); + private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) { + final int db_dim = RelationUtil.dimensionality(relation); // int num = 1; - List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>(); - for(BitSet pv : clustersMap.keySet()) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); + List<Cluster<SubspaceModel>> clusters = new ArrayList<>(); + for(long[] pv : clustersMap.keySet()) { + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); for(int i = 0; i < parallelClusters.size(); i++) { - Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i); - Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second); - cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); - String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, ""); + ArrayModifiableDBIDs c = parallelClusters.get(i); + Cluster<SubspaceModel> cluster = new Cluster<>(c); + cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c))); + String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim); if(parallelClusters.size() > 1) { cluster.setName("Cluster_" + subspace + "_" + i); } @@ -391,12 +364,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } // sort the clusters w.r.t. lambda - Comparator<Cluster<SubspaceModel<V>>> comparator = new Comparator<Cluster<SubspaceModel<V>>>() { + Comparator<Cluster<SubspaceModel>> comparator = new Comparator<Cluster<SubspaceModel>>() { @Override - public int compare(Cluster<SubspaceModel<V>> c1, Cluster<SubspaceModel<V>> c2) { + public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) { return c2.getModel().getSubspace().dimensionality() - c1.getModel().getSubspace().dimensionality(); } - }; Collections.sort(clusters, comparator); return clusters; @@ -406,32 +378,31 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Removes the clusters with size < minpts from the cluster map and adds them * to their parents. * - * @param database the database storing the objects - * @param distFunc the distance function + * @param relation the relation storing the objects * @param clustersMap the map containing the clusters - * @param minpts MinPts */ - private void checkClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap, int minpts) { + private void checkClusters(Relation<V> relation, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) { + final int dimensionality = RelationUtil.dimensionality(relation); // check if there are clusters < minpts // and add them to not assigned - List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>(); - Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray()); - for(BitSet pv : clustersMap.keySet()) { + List<Pair<long[], ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); + TCustomHashMap<long[], List<ArrayModifiableDBIDs>> newClustersMap = new TCustomHashMap<>(BitsUtil.TROVE_HASH_STRATEGY); + Pair<long[], ArrayModifiableDBIDs> noise = new Pair<>(BitsUtil.zero(dimensionality), DBIDUtil.newArray()); + for(long[] pv : clustersMap.keySet()) { // noise - if(pv.cardinality() == 0) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - noise.second.addDBIDs(c.second); + if(BitsUtil.cardinality(pv) == 0) { + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); + for(ArrayModifiableDBIDs c : parallelClusters) { + noise.second.addDBIDs(c); } } // clusters else { - List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size()); - for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - if(!pv.equals(new BitSet()) && c.second.size() < minpts) { - notAssigned.add(c); + List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv); + List<ArrayModifiableDBIDs> newParallelClusters = new ArrayList<>(parallelClusters.size()); + for(ArrayModifiableDBIDs c : parallelClusters) { + if(!BitsUtil.isZero(pv) && c.size() < mu) { + notAssigned.add(new Pair<>(pv, c)); } else { newParallelClusters.add(c); @@ -444,11 +415,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin clustersMap.clear(); clustersMap.putAll(newClustersMap); - for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { + for(Pair<long[], ArrayModifiableDBIDs> c : notAssigned) { if(c.second.isEmpty()) { continue; } - Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap); + Pair<long[], ArrayModifiableDBIDs> parent = findParent(relation, c, clustersMap); if(parent != null) { parent.second.addDBIDs(c.second); } @@ -457,30 +428,29 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } - List<Pair<BitSet, ArrayModifiableDBIDs>> noiseList = new ArrayList<>(1); - noiseList.add(noise); + List<ArrayModifiableDBIDs> noiseList = new ArrayList<>(1); + noiseList.add(noise.second); clustersMap.put(noise.first, noiseList); } /** * Returns the parent of the specified cluster * - * @param database the database storing the objects - * @param distFunc the distance function + * @param relation the relation storing the objects * @param child the child to search the parent for * @param clustersMap the map containing the clusters * @return the parent of the specified cluster */ - private Pair<BitSet, ArrayModifiableDBIDs> findParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Pair<BitSet, ArrayModifiableDBIDs> child, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { - V child_centroid = ProjectedCentroid.make(child.first, database, child.second).toVector(database); + private Pair<long[], ArrayModifiableDBIDs> findParent(Relation<V> relation, Pair<long[], ArrayModifiableDBIDs> child, TCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap) { + Vector child_centroid = ProjectedCentroid.make(child.first, relation, child.second); - Pair<BitSet, ArrayModifiableDBIDs> result = null; + Pair<long[], ArrayModifiableDBIDs> result = null; int resultCardinality = -1; - BitSet childPV = child.first; - int childCardinality = childPV.cardinality(); - for(BitSet parentPV : clustersMap.keySet()) { - int parentCardinality = parentPV.cardinality(); + long[] childPV = child.first; + int childCardinality = BitsUtil.cardinality(childPV); + for(long[] parentPV : clustersMap.keySet()) { + int parentCardinality = BitsUtil.cardinality(parentPV); if(parentCardinality >= childCardinality) { continue; } @@ -488,15 +458,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin continue; } - BitSet pv = (BitSet) childPV.clone(); - pv.and(parentPV); + long[] pv = BitsUtil.andCMin(childPV, parentPV); if(pv.equals(parentPV)) { - List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV); - for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { - V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database); - double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV); + List<ArrayModifiableDBIDs> parentList = clustersMap.get(parentPV); + for(ArrayModifiableDBIDs parent : parentList) { + Vector parent_centroid = ProjectedCentroid.make(parentPV, relation, parent); + double d = weightedDistance(child_centroid, parent_centroid, parentPV); if(d <= 2 * epsilon) { - result = parent; + result = new Pair<>(parentPV, parent); resultCardinality = parentCardinality; break; } @@ -510,65 +479,70 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin /** * Builds the cluster hierarchy. * - * @param distFunc the distance function * @param clustering Clustering we process * @param clusters the sorted list of clusters * @param dimensionality the dimensionality of the data * @param database the database containing the data objects */ - private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Clustering<SubspaceModel<V>> clustering, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) { - StringBuilder msg = new StringBuilder(); + private void buildHierarchy(Relation<V> database, Clustering<SubspaceModel> clustering, List<Cluster<SubspaceModel>> clusters, int dimensionality) { + StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null; final int db_dim = RelationUtil.dimensionality(database); - Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy(); + Hierarchy<Cluster<SubspaceModel>> hier = clustering.getClusterHierarchy(); for(int i = 0; i < clusters.size() - 1; i++) { - Cluster<SubspaceModel<V>> c_i = clusters.get(i); - int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality(); - V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database); + Cluster<SubspaceModel> c_i = clusters.get(i); + final Subspace s_i = c_i.getModel().getSubspace(); + int subspaceDim_i = dimensionality - s_i.dimensionality(); + Vector ci_centroid = ProjectedCentroid.make(s_i.getDimensions(), database, c_i.getIDs()); + long[] pv1 = s_i.getDimensions(); for(int j = i + 1; j < clusters.size(); j++) { - Cluster<SubspaceModel<V>> c_j = clusters.get(j); - int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality(); + Cluster<SubspaceModel> c_j = clusters.get(j); + final Subspace s_j = c_j.getModel().getSubspace(); + int subspaceDim_j = dimensionality - s_j.dimensionality(); if(subspaceDim_i < subspaceDim_j) { - if(LOG.isDebugging()) { - msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']'); - msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']'); + if(msg != null) { + msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)).append(']'); + msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)).append(']'); } // noise level reached - if(c_j.getModel().getSubspace().dimensionality() == 0) { + if(s_j.dimensionality() == 0) { // no parents exists -> parent is noise if(hier.numParents(c_i) == 0) { clustering.addChildCluster(c_j, c_i); - if(LOG.isDebugging()) { - msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); - msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); + if(msg != null) { + msg.append("\n [").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)); + msg.append("] is parent of [").append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)); msg.append(']'); } } } else { - V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database); - PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions()); - double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector()); - if(LOG.isDebugging()) { - msg.append("\n dist = ").append(distance.getCorrelationValue()); + Vector cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()); + long[] pv2 = s_j.getDimensions(); + long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2); + int subspaceDim = subspaceDimensionality(ci_centroid, cj_centroid, pv1, pv2, commonPreferenceVector); + + double d = weightedDistance(ci_centroid, cj_centroid, commonPreferenceVector); + if(msg != null) { + msg.append("\n dist = ").append(subspaceDim); } - if(distance.getCorrelationValue() == subspaceDim_j) { - if(LOG.isDebugging()) { + if(subspaceDim == subspaceDim_j) { + if(msg != null) { msg.append("\n d = ").append(d); } if(d <= 2 * epsilon) { // no parent exists or c_j is not a parent of the already // existing parents - if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { + if(hier.numParents(c_i) == 0 || !isParent(database, c_j, hier.iterParents(c_i), db_dim)) { clustering.addChildCluster(c_j, c_i); - if(LOG.isDebugging()) { - msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); + if(msg != null) { + msg.append("\n [").append(BitsUtil.toStringLow(s_j.getDimensions(), db_dim)); msg.append("] is parent of ["); - msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); + msg.append(BitsUtil.toStringLow(s_i.getDimensions(), db_dim)); msg.append(']'); } } @@ -581,7 +555,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } } - if(LOG.isDebugging()) { + if(msg != null) { LOG.debug(msg.toString()); } } @@ -590,31 +564,75 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Returns true, if the specified parent cluster is a parent of one child of * the children clusters. * - * @param database the database containing the objects - * @param distFunc the distance function for distance computation between the - * clusters + * @param relation the database containing the objects * @param parent the parent to be tested * @param iter the list of children to be tested + * @param db_dim Database dimensionality * @return true, if the specified parent cluster is a parent of one child of * the children clusters, false otherwise */ - private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, Iter<Cluster<SubspaceModel<V>>> iter) { - V parent_centroid = ProjectedCentroid.make(parent.getModel().getDimensions(), database, parent.getIDs()).toVector(database); - int dimensionality = RelationUtil.dimensionality(database); - int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality(); + private boolean isParent(Relation<V> relation, Cluster<SubspaceModel> parent, Iter<Cluster<SubspaceModel>> iter, int db_dim) { + Subspace s_p = parent.getModel().getSubspace(); + Vector parent_centroid = ProjectedCentroid.make(s_p.getDimensions(), relation, parent.getIDs()); + int subspaceDim_parent = db_dim - s_p.dimensionality(); for(; iter.valid(); iter.advance()) { - Cluster<SubspaceModel<V>> child = iter.get(); - V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database); - PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions()); - if(distance.getCorrelationValue() == subspaceDim_parent) { + Cluster<SubspaceModel> child = iter.get(); + Subspace s_c = child.getModel().getSubspace(); + Vector child_centroid = ProjectedCentroid.make(s_c.getDimensions(), relation, child.getIDs()); + long[] commonPreferenceVector = BitsUtil.andCMin(s_p.getDimensions(), s_c.getDimensions()); + int subspaceDim = subspaceDimensionality(parent_centroid, child_centroid, s_p.getDimensions(), s_c.getDimensions(), commonPreferenceVector); + if(subspaceDim == subspaceDim_parent) { return true; } } - return false; } + /** + * Compute the common subspace dimensionality of two vectors. + * + * @param v1 First vector + * @param v2 Second vector + * @param pv1 First preference + * @param pv2 Second preference + * @param commonPreferenceVector Common preference + * @return Usually, v1.dim - commonPreference.cardinality, unless either pv1 + * and pv2 are a subset of the other. + */ + private int subspaceDimensionality(NumberVector v1, NumberVector v2, long[] pv1, long[] pv2, long[] commonPreferenceVector) { + // number of zero values in commonPreferenceVector + int subspaceDim = v1.getDimensionality() - BitsUtil.cardinality(commonPreferenceVector); + + // special case: v1 and v2 are in parallel subspaces + if(BitsUtil.equal(commonPreferenceVector, pv1) || BitsUtil.equal(commonPreferenceVector, pv2)) { + double d = weightedDistance(v1, v2, commonPreferenceVector); + if(d > 2 * epsilon) { + subspaceDim++; + } + } + return subspaceDim; + } + + /** + * Computes the weighted distance between the two specified vectors according + * to the given preference vector. + * + * @param v1 the first vector + * @param v2 the second vector + * @param weightVector the preference vector + * @return the weighted distance between the two specified vectors according + * to the given preference vector + */ + protected static double weightedDistance(NumberVector v1, NumberVector v2, long[] weightVector) { + double sqrDist = 0; + for(int i = BitsUtil.nextSetBit(weightVector, 0); i >= 0; i = BitsUtil.nextSetBit(weightVector, i + 1)) { + double manhattanI = v1.doubleValue(i) - v2.doubleValue(i); + sqrDist += manhattanI * manhattanI; + } + return Math.sqrt(sqrDist); + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); @@ -626,20 +644,199 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } /** + * OPTICS variant used by DiSH internally. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public class DiSHOPTICS extends GeneralizedOPTICS<V, DiSHClusterOrderEntry> { + /** + * DiSH preprocessor instance. + */ + private DiSHPreferenceVectorIndex<V> index; + + /** + * Constructor. + * + * @param indexinst Preprocessor instance. + */ + public DiSHOPTICS(DiSHPreferenceVectorIndex<V> indexinst) { + super(mu); + this.index = indexinst; + } + + @Override + public Class<? super DiSHClusterOrderEntry> getEntryType() { + return DiSHClusterOrderEntry.class; + } + + @Override + protected DiSHClusterOrderEntry makeSeedEntry(Relation<V> relation, DBID objectID) { + return new DiSHClusterOrderEntry(objectID, null, Integer.MAX_VALUE, Double.POSITIVE_INFINITY, new long[0]); + } + + @Override + protected Collection<DiSHClusterOrderEntry> getNeighborsForDBID(Relation<V> relation, DBID id) { + DBID id1 = DBIDUtil.deref(id); + long[] pv1 = index.getPreferenceVector(id1); + V dv1 = relation.get(id1); + final int dim = dv1.getDimensionality(); + + long[] ones = BitsUtil.ones(dim); + long[] inverseCommonPreferenceVector = BitsUtil.ones(dim); + + ArrayList<DiSHClusterOrderEntry> result = new ArrayList<>(); + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + long[] pv2 = index.getPreferenceVector(iter); + V dv2 = relation.get(iter); + // We need a copy of this for the distance. + long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2); + + // number of zero values in commonPreferenceVector + int subspaceDim = dim - BitsUtil.cardinality(commonPreferenceVector); + + // special case: v1 and v2 are in parallel subspaces + if(BitsUtil.equal(commonPreferenceVector, pv1) || BitsUtil.equal(commonPreferenceVector, pv2)) { + double d = weightedDistance(dv1, dv2, commonPreferenceVector); + if(d > 2 * epsilon) { + subspaceDim++; + } + } + + // flip commonPreferenceVector for distance computation in common + // subspace + System.arraycopy(ones, 0, inverseCommonPreferenceVector, 0, ones.length); + BitsUtil.xorI(inverseCommonPreferenceVector, commonPreferenceVector); + + final double orthogonalDistance = weightedDistance(dv1, dv2, inverseCommonPreferenceVector); + result.add(new DiSHClusterOrderEntry(DBIDUtil.deref(iter), id1, subspaceDim, orthogonalDistance, commonPreferenceVector)); + } + Collections.sort(result); + // This is a hack, but needed to enforce core-distance of OPTICS: + if(result.size() >= getMinPts()) { + DiSHClusterOrderEntry coredist = result.get(getMinPts() - 1); + for(int i = 0; i < getMinPts() - 1; i++) { + final DiSHClusterOrderEntry prev = result.get(i); + result.set(i, new DiSHClusterOrderEntry(prev.getID(), id1, coredist.getCorrelationValue(), coredist.getEuclideanValue(), coredist.commonPreferenceVector)); + } + } + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.DOUBLE_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + } + + /** + * Cluster order entry for DiSH. + * + * @author Elke Achtert + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class DiSHClusterOrderEntry extends CorrelationClusterOrderEntry<DiSHClusterOrderEntry> { + /** + * The common preference vector of the two objects defining this distance. + */ + private long[] commonPreferenceVector; + + /** + * Constructs a new CorrelationDistance object. + * + * @param correlationValue the correlation dimension to be represented by + * the CorrelationDistance + * @param euclideanValue the Euclidean distance to be represented by the + * CorrelationDistance + * @param commonPreferenceVector the common preference vector of the two + * objects defining this distance + */ + public DiSHClusterOrderEntry(DBID objectID, DBID predecessorID, int correlationValue, double euclideanValue, long[] commonPreferenceVector) { + super(objectID, predecessorID, correlationValue, euclideanValue); + this.commonPreferenceVector = commonPreferenceVector; + } + + /** + * Returns the common preference vector of the two objects defining this + * distance. + * + * @return the common preference vector + */ + public long[] getCommonPreferenceVector() { + return commonPreferenceVector; + } + + /** + * Returns a string representation of this + * PreferenceVectorBasedCorrelationDistance. + * + * @return the correlation value, the Euclidean value and the common + * preference vector separated by blanks + */ + @Override + public String toString() { + return super.toString() + SEPARATOR + commonPreferenceVector.toString(); + } + + @Override + public int compareTo(DiSHClusterOrderEntry other) { + return super.compareTo(other); + } + } + + /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * Parameter that specifies the maximum radius of the neighborhood to be + * considered in each dimension for determination of the preference vector, + * must be a double equal to or greater than 0. + * <p> + * Default value: {@code 0.001} + * </p> + * <p> + * Key: {@code -dish.epsilon} + * </p> + */ + public static final OptionID EPSILON_ID = new OptionID("dish.epsilon", // + "The maximum radius of the neighborhood to be considered in each " // + + " dimension for determination of the preference vector."); + + /** + * Parameter that specifies the a minimum number of points as a smoothing + * factor to avoid the single-link-effect, must be an integer greater than + * 0. + * <p> + * Default value: {@code 1} + * </p> + * <p> + * Key: {@code -dish.mu} + * </p> + */ + public static final OptionID MU_ID = new OptionID("dish.mu", // + "The minimum number of points as a smoothing factor to avoid the single-link-effekt."); + protected double epsilon = 0.0; protected int mu = 1; - protected DiSHDistanceFunction dishDistance; - - protected Collection<Pair<OptionID, Object>> opticsO; + /** + * DiSH preprocessor. + */ + protected DiSHPreferenceVectorIndex.Factory<V> dishPreprocessor; @Override protected void makeOptions(Parameterization config) { @@ -657,53 +854,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin mu = muP.intValue(); } - configDiSHDistance(config, epsilon, mu); - - configOPTICS(config, mu, dishDistance); + configDiSHPreprocessor(config, epsilon, mu); } - public void configDiSHDistance(Parameterization config, double epsilon, int minpts) { + public void configDiSHPreprocessor(Parameterization config, double epsilon, int minpts) { ListParameterization dishParameters = new ListParameterization(); - dishParameters.addParameter(DiSHDistanceFunction.EPSILON_ID, epsilon); - dishParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, DiSHPreferenceVectorIndex.Factory.class); - dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, Double.toString(epsilon)); + dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, epsilon); dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.MINPTS_ID, minpts); ChainedParameterization dishchain = new ChainedParameterization(dishParameters, config); dishchain.errorsTo(config); - dishDistance = dishchain.tryInstantiate(DiSHDistanceFunction.class); - } - - /** - * Get the parameters for embedded OPTICS. - * - * @param config Parameterization - * @param minpts MinPts value - * @param dishDistance DiSH distance function - */ - public void configOPTICS(Parameterization config, final int minpts, final DiSHDistanceFunction dishDistance) { - // Configure OPTICS. Tracked parameters - ListParameterization opticsParameters = new ListParameterization(); - opticsParameters.addParameter(OPTICS.EPSILON_ID, AbstractDistance.INFINITY_PATTERN); - opticsParameters.addParameter(OPTICS.MINPTS_ID, minpts); - // Configure OPTICS. Untracked parameters - ListParameterization opticsUntrackedParameters = new ListParameterization(); - opticsUntrackedParameters.addParameter(OPTICS.DISTANCE_FUNCTION_ID, dishDistance); - ChainedParameterization optchain = new ChainedParameterization(opticsParameters, config); - TrackParameters trackpar = new TrackParameters(optchain); - - ChainedParameterization optchain2 = new ChainedParameterization(opticsUntrackedParameters, trackpar); - optchain2.errorsTo(config); - - // Instantiate OPTICS for parameterization - optchain2.tryInstantiate(OPTICS.class); - // store parameters - opticsO = trackpar.getGivenParameters(); + final Class<DiSHPreferenceVectorIndex.Factory<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DiSHPreferenceVectorIndex.Factory.class); + dishPreprocessor = dishchain.tryInstantiate(cls); } @Override protected DiSH<V> makeInstance() { - return new DiSH<>(epsilon, dishDistance, opticsO); + return new DiSH<>(epsilon, mu, dishPreprocessor); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 3f135564..b0ad5eff 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -1,39 +1,53 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
-/* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures - -Copyright (C) 2013 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ -
-import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.ClusterOrderResult;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.CorrelationClusterOrderEntry;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.GeneralizedOPTICS;
import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.HiSCDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelationDistance;
+import de.lmu.ifi.dbs.elki.index.IndexFactory;
import de.lmu.ifi.dbs.elki.index.preprocessed.preference.HiSCPreferenceVectorIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
@@ -53,26 +67,135 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; * @author Elke Achtert
*
* @apiviz.uses HiSCPreferenceVectorIndex
- * @apiviz.uses HiSCDistanceFunction
*
* @param <V> the type of NumberVector handled by the algorithm
*/
@Title("Finding Hierarchies of Subspace Clusters")
@Description("Algorithm for detecting hierarchies of subspace clusters.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Finding Hierarchies of Subspace Clusters", booktitle = "Proc. 10th Europ. Conf. on Principles and Practice of Knowledge Discovery in Databases (PKDD'06), Berlin, Germany, 2006", url = "http://www.dbs.ifi.lmu.de/Publikationen/Papers/PKDD06-HiSC.pdf")
-public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorBasedCorrelationDistance> {
+public class HiSC<V extends NumberVector> extends GeneralizedOPTICS<V, HiSC.HiSCClusterOrderEntry> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(HiSC.class);
/**
+ * Factory to produce
+ */
+ private IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>> indexfactory;
+
+ /**
+ * Instantiated index.
+ */
+ private HiSCPreferenceVectorIndex<NumberVector> index;
+
+ /**
+ * Relation we are currently processing.
+ */
+ private Relation<V> relation;
+
+ /**
+ * Holds the maximum diversion allowed.
+ */
+ private double alpha;
+
+ /**
* Constructor.
- *
- * @param distanceFunction HiSC distance function used
+ *
+ * @param indexfactory HiSC index factory
+ */
+ public HiSC(IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>> indexfactory, double epsilon) {
+ super(2);
+ this.indexfactory = indexfactory;
+ this.alpha = epsilon;
+ }
+
+ @Override
+ public ClusterOrderResult<HiSCClusterOrderEntry> run(Relation<V> relation) {
+ assert (this.index == null && this.relation == null) : "Running algorithm instance multiple times in parallel is not supported.";
+ this.index = indexfactory.instantiate(relation);
+ this.relation = relation;
+ ClusterOrderResult<HiSCClusterOrderEntry> result = super.run(relation);
+ this.index = null;
+ this.relation = null;
+ return result;
+ }
+
+ @Override
+ protected HiSCClusterOrderEntry makeSeedEntry(Relation<V> relation, DBID objectID) {
+ return new HiSCClusterOrderEntry(objectID, null, Integer.MAX_VALUE, Double.POSITIVE_INFINITY, new long[0]);
+ }
+
+ @Override
+ protected Collection<HiSCClusterOrderEntry> getNeighborsForDBID(Relation<V> relation, DBID id) {
+ DBID id1 = DBIDUtil.deref(id);
+ long[] pv1 = index.getPreferenceVector(id1);
+ V v1 = relation.get(id1);
+ final int dim = v1.getDimensionality();
+
+ ArrayList<HiSCClusterOrderEntry> result = new ArrayList<>();
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ long[] pv2 = index.getPreferenceVector(iter);
+ V v2 = relation.get(iter);
+ final long[] commonPreferenceVector = BitsUtil.andCMin(pv1, pv2);
+
+ // number of zero values in commonPreferenceVector
+ int subspaceDim = dim - BitsUtil.cardinality(commonPreferenceVector);
+
+ // special case: v1 and v2 are in parallel subspaces
+ double dist1 = weightedDistance(v1, v2, pv1);
+ double dist2 = weightedDistance(v1, v2, pv2);
+
+ if(Math.max(dist1, dist2) > alpha) {
+ subspaceDim++;
+ if(LOG.isDebugging()) {
+ StringBuilder msg = new StringBuilder();
+ msg.append("\ndist1 ").append(dist1);
+ msg.append("\ndist2 ").append(dist2);
+ msg.append("\nsubspaceDim ").append(subspaceDim);
+ msg.append("\ncommon pv ").append(BitsUtil.toStringLow(commonPreferenceVector, dim));
+ LOG.debugFine(msg.toString());
+ }
+ }
+
+ // flip commonPreferenceVector for distance computation in common subspace
+ long[] inverseCommonPreferenceVector = BitsUtil.ones(dim);
+ BitsUtil.xorI(inverseCommonPreferenceVector, commonPreferenceVector);
+
+ final double orthogonalDistance = weightedDistance(v1, v2, inverseCommonPreferenceVector);
+ result.add(new HiSCClusterOrderEntry(DBIDUtil.deref(iter), id1, subspaceDim, orthogonalDistance, commonPreferenceVector));
+ }
+ Collections.sort(result);
+ return result;
+ }
+
+ /**
+ * Computes the weighted distance between the two specified vectors according
+ * to the given preference vector.
+ *
+ * @param v1 the first vector
+ * @param v2 the second vector
+ * @param weightVector the preference vector
+ * @return the weighted distance between the two specified vectors according
+ * to the given preference vector
*/
- public HiSC(HiSCDistanceFunction<V> distanceFunction) {
- super(distanceFunction, distanceFunction.getDistanceFactory().infiniteDistance(), 2);
+ public double weightedDistance(V v1, V v2, long[] weightVector) {
+ double sqrDist = 0.;
+ for(int i = BitsUtil.nextSetBit(weightVector, 0); i >= 0; i = BitsUtil.nextSetBit(weightVector, i + 1)) {
+ double manhattanI = v1.doubleValue(i) - v2.doubleValue(i);
+ sqrDist += manhattanI * manhattanI;
+ }
+ return Math.sqrt(sqrDist);
+ }
+
+ @Override
+ public Class<? super HiSCClusterOrderEntry> getEntryType() {
+ return HiSCClusterOrderEntry.class;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(indexfactory.getInputTypeRestriction());
}
@Override
@@ -81,22 +204,99 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB }
/**
+ * Cluster order entry for HiSC.
+ *
+ * @author Elke Achtert
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class HiSCClusterOrderEntry extends CorrelationClusterOrderEntry<HiSCClusterOrderEntry> {
+ /**
+ * The common preference vector of the two objects defining this distance.
+ */
+ private long[] commonPreferenceVector;
+
+ /**
+ * Constructs a new CorrelationDistance object.
+ *
+ * @param correlationValue the correlation dimension to be represented by
+ * the CorrelationDistance
+ * @param euclideanValue the Euclidean distance to be represented by the
+ * CorrelationDistance
+ * @param commonPreferenceVector the common preference vector of the two
+ * objects defining this distance
+ */
+ public HiSCClusterOrderEntry(DBID objectID, DBID predecessorID, int correlationValue, double euclideanValue, long[] commonPreferenceVector) {
+ super(objectID, predecessorID, correlationValue, euclideanValue);
+ this.commonPreferenceVector = commonPreferenceVector;
+ }
+
+ /**
+ * Returns the common preference vector of the two objects defining this
+ * distance.
+ *
+ * @return the common preference vector
+ */
+ public long[] getCommonPreferenceVector() {
+ return commonPreferenceVector;
+ }
+
+ /**
+ * Returns a string representation of this
+ * PreferenceVectorBasedCorrelationDistance.
+ *
+ * @return the correlation value, the Euclidean value and the common
+ * preference vector separated by blanks
+ */
+ @Override
+ public String toString() {
+ return super.toString() + SEPARATOR + commonPreferenceVector.toString();
+ }
+
+ @Override
+ public int compareTo(HiSCClusterOrderEntry other) {
+ return super.compareTo(other);
+ }
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
- HiSCDistanceFunction<V> distanceFunction;
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the maximum distance between two vectors with equal
+ * preference vectors before considering them as parallel, must be a double
+ * equal to or greater than 0.
+ * <p>
+ * Default value: {@code 0.001}
+ * </p>
+ * <p>
+ * Key: {@code -hisc.epsilon}
+ * </p>
+ */
+ public static final OptionID EPSILON_ID = new OptionID("hisc.epsilon", "The maximum distance between two vectors with equal preference vectors before considering them as parallel.");
+
+ /**
+ * Factory to produce the index.
+ */
+ private IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>> indexfactory;
+
+ /**
+ * Alpha parameter.
+ */
+ double alpha;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
- alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
- double alpha = 0.0;
if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
@@ -104,21 +304,19 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB // Configure HiSC distance function
ListParameterization opticsParameters = new ListParameterization();
- // distance function
- opticsParameters.addParameter(HiSCDistanceFunction.EPSILON_ID, alpha);
// preprocessor
opticsParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, HiSCPreferenceVectorIndex.Factory.class);
opticsParameters.addParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, alpha);
ChainedParameterization chain = new ChainedParameterization(opticsParameters, config);
chain.errorsTo(config);
- Class<HiSCDistanceFunction<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(HiSCDistanceFunction.class);
- distanceFunction = chain.tryInstantiate(cls);
+ final Class<? extends IndexFactory<V, HiSCPreferenceVectorIndex<NumberVector>>> cls = ClassGenericsUtil.uglyCrossCast(HiSCPreferenceVectorIndex.Factory.class, IndexFactory.class);
+ indexfactory = chain.tryInstantiate(cls);
}
@Override
protected HiSC<V> makeInstance() {
- return new HiSC<>(distanceFunction);
+ return new HiSC<>(indexfactory, alpha);
}
}
}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java index 9d1ee94d..4e1de7c7 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,12 +25,13 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; import java.util.ArrayList;
import java.util.Arrays;
-import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EM;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EMClusterModel;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -61,7 +62,6 @@ import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
@@ -103,7 +103,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; */
@Title("P3C: A Robust Projected Clustering Algorithm.")
@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
-public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+public class P3C<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
@@ -156,7 +156,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering /**
* Performs the P3C algorithm on the given Database.
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
final int dim = RelationUtil.dimensionality(relation);
// Overall progress.
@@ -167,7 +167,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
// Desired number of bins, as per Sturge:
- final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
+ final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
// Perform 1-dimensional projections, and split into bins.
SetDBIDs[][] partitions = partitionData(relation, binCount);
@@ -178,7 +178,6 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering // Set markers for each attribute until they're all deemed uniform.
final long[][] markers = new long[dim][];
- int numuniform = 0;
for(int d = 0; d < dim; d++) {
final SetDBIDs[] parts = partitions[d];
if(parts == null) {
@@ -191,7 +190,6 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering // previously marked.
int bestBin = chiSquaredUniformTest(parts, marked, card);
if(bestBin < 0) {
- numuniform++;
break; // Uniform
}
BitsUtil.setI(marked, bestBin);
@@ -224,9 +222,9 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
if(clusterCores.size() == 0) {
- stepProgress.setCompleted(LOG);
- Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
- c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+ LOG.setCompleted(stepProgress);
+ Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
+ c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
return c;
}
@@ -238,26 +236,19 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering ModifiableDBIDs noise = DBIDUtil.newHashSet();
WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
int k = clusterCores.size();
- double[] clusterWeights = new double[k];
- computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+ List<MultivariateGaussianModel> models = new ArrayList<>(k);
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
// Initial estimate of covariances, to assign noise objects
- Vector[] means = new Vector[k];
- Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
- final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
- double[] normDistrFactor = new double[k];
- Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
- EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
- EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
- assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
-
- double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
+ assignUnassigned(relation, probClusterIGivenX, models, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
final double emOld = emNew;
- EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
- EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models);
// reassign probabilities
- emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
if(LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
@@ -283,7 +274,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering // Outlier detection. Remove points from clusters that have a Mahalanobis
// distance larger than the critical value of the ChiSquare distribution.
- findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+ findOutliers(relation, models, clusterCandidates, noise);
if(stepProgress != null) {
stepProgress.beginStep(8, "Removing empty clusters.", LOG);
@@ -312,20 +303,18 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
// Generate final output.
- Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
ClusterCandidate candidate = clusterCandidates.get(cluster);
CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
- result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
}
LOG.verbose("Noise size: " + noise.size());
if(noise.size() > 0) {
- result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
}
- if(stepProgress != null) {
- stepProgress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(stepProgress);
return result;
}
@@ -500,7 +489,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param parts Parts array
* @param start Array start index
* @param end Array end index (exclusive)
- * @return
+ * @return Union
*/
protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
int sum = 0;
@@ -561,12 +550,15 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param clusterCores the cluster cores.
* @param unassigned set to which to add unassigned points.
* @param probClusterIGivenX Membership probabilities.
- * @param clusterWeights Cluster weights
+ * @param models Cluster models.
+ * @param dim Dimensionality
*/
- private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+ private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, int dim) {
final int n = relation.size();
+ final double pweight = 1. / n; // Weight of each point
final int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
int count = 0;
double[] weights = new double[k];
@@ -581,7 +573,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering if(count > 0) {
// Rescale.
VMath.timesEquals(weights, 1. / count);
- VMath.plusTimesEquals(clusterWeights, weights, 1. / n);
+ VMath.plusTimesEquals(clusterWeights, weights, pweight);
}
else {
// Does not match any cluster, mark it.
@@ -589,6 +581,10 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering }
probClusterIGivenX.put(iter, weights);
}
+ final double f = MathUtil.powi(MathUtil.TWOPI, dim); // Scaling coefficient
+ for(int i = 0; i < k; i++) {
+ models.add(new MultivariateGaussianModel(clusterWeights[i], new Vector(dim), f));
+ }
}
/**
@@ -597,36 +593,44 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering *
* @param relation Data relation
* @param probClusterIGivenX fuzzy membership matrix.
- * @param means Cluster means.
- * @param invCovMatr Cluster covariance matrices.
- * @param clusterWeights
- * @param assigned mapping of matrix row to DBID.
+ * @param models Cluster models.
* @param unassigned the list of points not yet assigned.
*/
- private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+ private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, ModifiableDBIDs unassigned) {
if(unassigned.size() == 0) {
return;
}
- final int k = means.length;
+ final int k = models.size();
double pweight = 1. / relation.size();
+ // Rescale weights, to take unassigned points into account:
+ for(EMClusterModel<?> m : models) {
+ m.setWeight(m.getWeight() * (relation.size() - unassigned.size()) * pweight);
+ }
+
+ // Assign noise objects, increase weights accordingly.
for(DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
// Find the best matching known cluster core using the Mahalanobis
// distance.
- Vector v = relation.get(iter).getColumnVector();
+ V v = relation.get(iter);
int bestCluster = -1;
+ MultivariateGaussianModel bestModel = null;
double minDistance = Double.POSITIVE_INFINITY;
- for(int c = 0; c < k; ++c) {
- final double distance = MathUtil.mahalanobisDistance(invCovMatr[c], v.minus(means[c]));
+ int c = 0;
+ for(MultivariateGaussianModel model : models) {
+ final double distance = model.mahalanobisDistance(v);
if(distance < minDistance) {
minDistance = distance;
bestCluster = c;
+ bestModel = model;
}
+ c++;
}
// Assign to best core.
double[] weights = new double[k];
- weights[bestCluster] = 1.0;
- clusterWeights[bestCluster] += pweight;
+ weights[bestCluster] = 1.;
+
+ bestModel.setWeight(bestModel.getWeight() + pweight);
probClusterIGivenX.put(iter, weights);
}
@@ -675,28 +679,19 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * attributes.
*
* @param relation Data relation
- * @param means Cluster means
- * @param invCovMatr Inverse covariance matrixes
+ * @param models Cluster models
* @param clusterCandidates the list of clusters to check.
- * @param nonUniformDimensionCount the number of dimensions to consider when
- * testing.
* @param noise the set to which to add points deemed outliers.
*/
- private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
- final int k = clusterCandidates.size();
-
- for(int c = 0; c < k; ++c) {
+ private void findOutliers(Relation<V> relation, List<MultivariateGaussianModel> models, ArrayList<ClusterCandidate> clusterCandidates, ModifiableDBIDs noise) {
+ Iterator<MultivariateGaussianModel> it = models.iterator();
+ for(int c = 0; it.hasNext(); c++) {
+ MultivariateGaussianModel model = it.next();
final ClusterCandidate candidate = clusterCandidates.get(c);
- if(candidate.ids.size() < 2) {
- continue;
- }
- final int dof = candidate.dimensions.cardinality();
- final double threshold = ChiSquaredDistribution.quantile(1 - .001, dof);
+ final int dof = BitsUtil.cardinality(candidate.dimensions);
+ final double threshold = ChiSquaredDistribution.quantile(1 - alpha, dof);
for(DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
- final Vector mean = means[c];
- final Vector delta = relation.get(iter).getColumnVector().minusEquals(mean);
- final Matrix invCov = invCovMatr[c];
- final double distance = MathUtil.mahalanobisDistance(invCov, delta);
+ final double distance = model.mahalanobisDistance(relation.get(iter));
if(distance >= threshold) {
// Outlier, remove it and add it to the outlier set.
noise.add(iter);
@@ -851,7 +846,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering /**
* Selected dimensions
*/
- public final BitSet dimensions;
+ public final long[] dimensions;
/**
* Objects contained in cluster.
@@ -864,9 +859,9 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering * @param clusterCore Signature
*/
public ClusterCandidate(Signature clusterCore) {
- this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+ this.dimensions = BitsUtil.zero(clusterCore.spec.length >> 1);
for(int i = 0; i < clusterCore.spec.length; i += 2) {
- this.dimensions.set(i >> 1);
+ BitsUtil.setI(this.dimensions, i >> 1);
}
this.ids = DBIDUtil.newArray(clusterCore.ids.size());
}
@@ -889,7 +884,7 @@ public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering *
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter for the chi squared test threshold.
*/
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 03e9978f..805281b0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -28,7 +28,6 @@ import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import java.util.ArrayList; -import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -44,25 +43,32 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultUtil; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -71,13 +77,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.CTriple; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * <p/> - * Provides the PROCLUS algorithm, an algorithm to find subspace clusters in - * high dimensional spaces. + * The PROCLUS algorithm, an algorithm to find subspace clusters in high + * dimensional spaces. * </p> * <p/> * Reference: <br> @@ -92,30 +97,20 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param <V> the type of NumberVector handled by this Algorithm */ -// TODO: optimize by creating much less objects @Title("PROCLUS: PROjected CLUStering") @Description("Algorithm to find subspace clusters in high dimensional spaces.") -@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188") -public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", // +title = "Fast Algorithms for Projected Clustering", // +booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", // +url = "http://dx.doi.org/10.1145/304181.304188") +public class PROCLUS<V extends NumberVector> extends AbstractProjectedClustering<Clustering<SubspaceModel>, V> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(PROCLUS.class); /** - * Parameter to specify the multiplier for the initial number of medoids, must - * be an integer greater than 0. - * <p> - * Default value: {@code 10} - * </p> - * <p> - * Key: {@code -proclus.mi} - * </p> - */ - public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids."); - - /** - * Holds the value of {@link #M_I_ID}. + * Multiplier for the initial number of medoids. */ private int m_i; @@ -145,27 +140,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param database Database to process * @param relation Relation to process */ - public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { - DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); - RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); + public Clustering<SubspaceModel> run(Database database, Relation<V> relation) { + DistanceQuery<V> distFunc = this.getDistanceQuery(database); + RangeQuery<V> rangeQuery = database.getRangeQuery(distFunc); final Random random = rnd.getSingleThreadedRandom(); - if (RelationUtil.dimensionality(relation) < l) { + if(RelationUtil.dimensionality(relation) < l) { throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")"); } // TODO: use a StepProgress! // initialization phase - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("1. Initialization phase..."); } int sampleSize = Math.min(relation.size(), k_i * k); DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random.nextLong()); int medoidSize = Math.min(relation.size(), m_i * k); - DBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random); + ArrayDBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("sampleSize ").append(sampleSize).append('\n'); @@ -176,15 +171,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } // iterative phase - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("2. Iterative phase..."); } double bestObjective = Double.POSITIVE_INFINITY; - ModifiableDBIDs m_best = null; - ModifiableDBIDs m_bad = null; - ModifiableDBIDs m_current = initialSet(medoids, k, random); + ArrayDBIDs m_best = null; + DBIDs m_bad = null; + ArrayDBIDs m_current = initialSet(medoids, k, random); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("m_c ").append(m_current).append('\n'); @@ -193,47 +188,44 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null; - // TODO: Use DataStore and Trove for performance - Map<DBID, PROCLUSCluster> clusters = null; + ArrayList<PROCLUSCluster> clusters = null; int loops = 0; - while (loops < 10) { - Map<DBID, TIntSet> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); - clusters = assignPoints(dimensions, relation); + while(loops < 10) { + TIntSet[] dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); + clusters = assignPoints(m_current, dimensions, relation); double objectiveFunction = evaluateClusters(clusters, dimensions, relation); - if (objectiveFunction < bestObjective) { + if(objectiveFunction < bestObjective) { // restart counting loops loops = 0; bestObjective = objectiveFunction; m_best = m_current; - m_bad = computeBadMedoids(clusters, (int) (relation.size() * 0.1 / k)); + m_bad = computeBadMedoids(m_current, clusters, (int) (relation.size() * 0.1 / k)); } m_current = computeM_current(medoids, m_best, m_bad, random); loops++; - if (cprogress != null) { + if(cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } } - if (cprogress != null) { - cprogress.setCompleted(LOG); - } + LOG.setCompleted(cprogress); // refinement phase - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("3. Refinement phase..."); } - List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<>(clusters.values()), relation); + List<Pair<Vector, TIntSet>> dimensions = findDimensions(clusters, relation); List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation); // build result int numClusters = 1; - Clustering<SubspaceModel<V>> result = new Clustering<>("ProClus clustering", "proclus-clustering"); - for (PROCLUSCluster c : finalClusters) { - Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.objectIDs); - cluster.setModel(new SubspaceModel<>(new Subspace(c.getDimensions()), c.centroid)); + Clustering<SubspaceModel> result = new Clustering<>("ProClus clustering", "proclus-clustering"); + for(PROCLUSCluster c : finalClusters) { + Cluster<SubspaceModel> cluster = new Cluster<>(c.objectIDs); + cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid)); cluster.setName("cluster_" + numClusters++); result.addToplevelCluster(cluster); @@ -250,48 +242,64 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param random random number generator * @return a piercing set of m medoids from the specified sample set */ - private ModifiableDBIDs greedy(DistanceQuery<V, DoubleDistance> distFunc, DBIDs sampleSet, int m, Random random) { + private ArrayDBIDs greedy(DistanceQuery<V> distFunc, DBIDs sampleSet, int m, Random random) { + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(m); + ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet); - ModifiableDBIDs medoids = DBIDUtil.newHashSet(); + DBIDArrayIter iter = s.iter(); + DBIDVar m_i = DBIDUtil.newVar(); + int size = s.size(); // m_1 is random point of S - DBID m_i = s.remove(random.nextInt(s.size())); + final int r = random.nextInt(size); + s.assignVar(r, m_i); medoids.add(m_i); - if (LOG.isDebugging()) { - LOG.debugFiner("medoids " + medoids); + if(LOG.isDebugging()) { + LOG.debugFiner("medoids " + medoids.toString()); } + // Remove m_i from candidates, by moving to the end. + s.swap(r, size - 1); + --size; + + // To track the current worst element: + int worst = -1; + double worstd = Double.NEGATIVE_INFINITY; // compute distances between each point in S and m_i - // FIXME: don't use maps, so we can work with DBIDRef - Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<>(); - for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { - DBID id = DBIDUtil.deref(iter); - DoubleDistance dist = distFunc.distance(id, m_i); - distances.put(id, DBIDUtil.newDistancePair(dist, id)); + WritableDoubleDataStore distances = DataStoreUtil.makeDoubleStorage(s, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + for(iter.seek(0); iter.getOffset() < size; iter.advance()) { + final double dist = distFunc.distance(iter, m_i); + distances.putDouble(iter, dist); + if(dist > worstd) { + worstd = dist; + worst = iter.getOffset(); + } } - for (int i = 1; i < m; i++) { + for(int i = 1; i < m; i++) { // choose medoid m_i to be far from previous medoids - List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<>(distances.values()); - DistanceDBIDResultUtil.sortByDistance(d); - - m_i = DBIDUtil.deref(d.get(d.size() - 1)); + s.assignVar(worst, m_i); medoids.add(m_i); - s.remove(m_i); - distances.remove(m_i); - - // compute distances of each point to closest medoid - for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { - DBID id = DBIDUtil.deref(iter); - DoubleDistance dist_new = distFunc.distance(id, m_i); - DoubleDistance dist_old = distances.get(id).getDistance(); - - DoubleDistance dist = dist_new.compareTo(dist_old) < 0 ? dist_new : dist_old; - distances.put(id, DBIDUtil.newDistancePair(dist, id)); + // Remove m_i from candidates, by moving to the end. + s.swap(worst, size - 1); + --size; + + // compute distances of each point to closest medoid; track worst. + worst = -1; + worstd = Double.NEGATIVE_INFINITY; + for(iter.seek(0); iter.getOffset() < size; iter.advance()) { + double dist_new = distFunc.distance(iter, m_i); + double dist_old = distances.doubleValue(iter); + double dist = (dist_new < dist_old) ? dist_new : dist_old; + distances.putDouble(iter, dist); + if(dist > worstd) { + worstd = dist; + worst = iter.getOffset(); + } } - if (LOG.isDebugging()) { - LOG.debugFiner("medoids " + medoids); + if(LOG.isDebugging()) { + LOG.debugFiner("medoids " + medoids.toString()); } } @@ -306,14 +314,8 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param random random number generator * @return a set of k elements from the specified sample set */ - private ModifiableDBIDs initialSet(DBIDs sampleSet, int k, Random random) { - ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet); - ModifiableDBIDs initialSet = DBIDUtil.newHashSet(); - while (initialSet.size() < k) { - DBID next = s.remove(random.nextInt(s.size())); - initialSet.add(next); - } - return initialSet; + private ArrayDBIDs initialSet(DBIDs sampleSet, int k, Random random) { + return DBIDUtil.ensureArray(DBIDUtil.randomSample(sampleSet, k, random)); } /** @@ -325,21 +327,21 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param random random number generator * @return m_current, the set of medoids in current iteration */ - private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) { + private ArrayDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) { ArrayModifiableDBIDs m_list = DBIDUtil.newArray(m); m_list.removeDBIDs(m_best); - ModifiableDBIDs m_current = DBIDUtil.newHashSet(); - for (DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { - DBID m_i = DBIDUtil.deref(iter); - if (m_bad.contains(m_i)) { + ArrayModifiableDBIDs m_current = DBIDUtil.newArray(); + for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { + if(m_bad.contains(iter)) { int currentSize = m_current.size(); - while (m_current.size() == currentSize) { + while(m_current.size() == currentSize) { DBID next = m_list.remove(random.nextInt(m_list.size())); m_current.add(next); } - } else { - m_current.add(m_i); + } + else { + m_current.add(iter); } } @@ -357,29 +359,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param distFunc the distance function * @return a mapping of the medoid's id to its locality */ - private Map<DBID, DistanceDBIDList<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { - Map<DBID, DistanceDBIDList<DoubleDistance>> result = new HashMap<>(); + private DataStore<DoubleDBIDList> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) { + WritableDataStore<DoubleDBIDList> result = DataStoreUtil.makeStorage(medoids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, DoubleDBIDList.class); - for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m = DBIDUtil.deref(iter); + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { // determine minimum distance between current medoid m and any other // medoid m_i - DoubleDistance minDist = null; - for (DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { - DBID m_i = DBIDUtil.deref(iter2); - if (DBIDUtil.equal(m_i, m)) { + double minDist = Double.POSITIVE_INFINITY; + for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { + if(DBIDUtil.equal(iter, iter2)) { continue; } - DoubleDistance currentDist = distFunc.distance(m, m_i); - if (minDist == null || currentDist.compareTo(minDist) < 0) { + double currentDist = distFunc.distance(iter, iter2); + if(currentDist < minDist) { minDist = currentDist; } } // determine points in sphere centered at m with radius minDist - assert minDist != null; - DistanceDBIDList<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist); - result.put(m, qr); + assert minDist != Double.POSITIVE_INFINITY; + DoubleDBIDList qr = rangeQuery.getRangeForDBID(iter, minDist); + result.put(iter, qr); } return result; @@ -395,68 +395,67 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the set of correlated dimensions for each medoid in the specified * medoid set */ - private Map<DBID, TIntSet> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { + private TIntSet[] findDimensions(ArrayDBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) { // get localities - Map<DBID, DistanceDBIDList<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery); + DataStore<DoubleDBIDList> localities = getLocalities(medoids, database, distFunc, rangeQuery); // compute x_ij = avg distance from points in l_i to medoid m_i int dim = RelationUtil.dimensionality(database); - Map<DBID, double[]> averageDistances = new HashMap<>(); + double[][] averageDistances = new double[medoids.size()][]; - for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m_i = DBIDUtil.deref(iter); - V medoid_i = database.get(m_i); - DistanceDBIDList<DoubleDistance> l_i = localities.get(m_i); + int i = 0; + for(DBIDArrayIter iter = medoids.iter(); iter.valid(); iter.advance(), i++) { + V medoid_i = database.get(iter); + DoubleDBIDList l_i = localities.get(iter); double[] x_i = new double[dim]; - for (DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { + for(DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { V o = database.get(qr); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] += Math.abs(medoid_i.doubleValue(d) - o.doubleValue(d)); } } - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] /= l_i.size(); } - averageDistances.put(m_i, x_i); + averageDistances[i] = x_i; } - Map<DBID, TIntSet> dimensionMap = new HashMap<>(); - List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<>(); - for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m_i = DBIDUtil.deref(iter); + TIntSet[] dimensionMap = new TIntSet[medoids.size()]; + List<DoubleIntInt> z_ijs = new ArrayList<>(); + for(i = 0; i < medoids.size(); i++) { TIntSet dims_i = new TIntHashSet(); - dimensionMap.put(m_i, dims_i); + dimensionMap[i] = dims_i; - double[] x_i = averageDistances.get(m_i); + double[] x_i = averageDistances[i]; // y_i double y_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { y_i += x_i[j]; } y_i /= dim; // sigma_i double sigma_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { double diff = x_i[j] - y_i; sigma_i += diff * diff; } sigma_i /= (dim - 1); sigma_i = Math.sqrt(sigma_i); - for (int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, m_i, j)); + for(int j = 0; j < dim; j++) { + z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j)); } } Collections.sort(z_ijs); int max = Math.max(k * l, 2); - for (int m = 0; m < max; m++) { - CTriple<Double, DBID, Integer> z_ij = z_ijs.get(m); - TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); - dims_i.add(z_ij.getThird()); + for(int m = 0; m < max; m++) { + DoubleIntInt z_ij = z_ijs.get(m); + TIntSet dims_i = dimensionMap[z_ij.dimi]; + dims_i.add(z_ij.dimj); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("z_ij ").append(z_ij).append('\n'); @@ -476,64 +475,65 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the set of correlated dimensions for each specified cluster * centroid */ - private List<Pair<V, TIntSet>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) { + private List<Pair<Vector, TIntSet>> findDimensions(ArrayList<PROCLUSCluster> clusters, Relation<V> database) { // compute x_ij = avg distance from points in c_i to c_i.centroid int dim = RelationUtil.dimensionality(database); - Map<Integer, double[]> averageDistances = new HashMap<>(); + final int numc = clusters.size(); + double[][] averageDistances = new double[numc][]; - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < numc; i++) { PROCLUSCluster c_i = clusters.get(i); double[] x_i = new double[dim]; - for (DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] += Math.abs(c_i.centroid.doubleValue(d) - o.doubleValue(d)); } } - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] /= c_i.objectIDs.size(); } - averageDistances.put(i, x_i); + averageDistances[i] = x_i; } - List<CTriple<Double, Integer, Integer>> z_ijs = new ArrayList<>(); - for (int i = 0; i < clusters.size(); i++) { - double[] x_i = averageDistances.get(i); + List<DoubleIntInt> z_ijs = new ArrayList<>(); + for(int i = 0; i < numc; i++) { + double[] x_i = averageDistances[i]; // y_i double y_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { y_i += x_i[j]; } y_i /= dim; // sigma_i double sigma_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { double diff = x_i[j] - y_i; sigma_i += diff * diff; } sigma_i /= (dim - 1); sigma_i = Math.sqrt(sigma_i); - for (int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, i, j)); + for(int j = 0; j < dim; j++) { + z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j)); } } Collections.sort(z_ijs); // mapping cluster index -> dimensions - Map<Integer, TIntSet> dimensionMap = new HashMap<>(); + TIntSet[] dimensionMap = new TIntSet[numc]; int max = Math.max(k * l, 2); - for (int m = 0; m < max; m++) { - CTriple<Double, Integer, Integer> z_ij = z_ijs.get(m); - TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); - if (dims_i == null) { + for(int m = 0; m < max; m++) { + DoubleIntInt z_ij = z_ijs.get(m); + TIntSet dims_i = dimensionMap[z_ij.dimi]; + if(dims_i == null) { dims_i = new TIntHashSet(); - dimensionMap.put(z_ij.getSecond(), dims_i); + dimensionMap[z_ij.dimi] = dims_i; } - dims_i.add(z_ij.getThird()); + dims_i.add(z_ij.dimj); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("z_ij ").append(z_ij).append('\n'); @@ -543,9 +543,12 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } // mapping cluster -> dimensions - List<Pair<V, TIntSet>> result = new ArrayList<>(); - for (int i : dimensionMap.keySet()) { - TIntSet dims_i = dimensionMap.get(i); + List<Pair<Vector, TIntSet>> result = new ArrayList<>(); + for(int i = 0; i < numc; i++) { + TIntSet dims_i = dimensionMap[i]; + if(dims_i == null) { + continue; + } PROCLUSCluster c_i = clusters.get(i); result.add(new Pair<>(c_i.centroid, dims_i)); } @@ -555,45 +558,51 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster /** * Assigns the objects to the clusters. * + * @param m_current Current centers * @param dimensions set of correlated dimensions for each medoid of the * cluster * @param database the database containing the objects * @return the assignments of the object to the clusters */ - private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, TIntSet> dimensions, Relation<V> database) { - Map<DBID, ModifiableDBIDs> clusterIDs = new HashMap<>(); - for (DBID m_i : dimensions.keySet()) { - clusterIDs.put(m_i, DBIDUtil.newHashSet()); - } - - for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { - DBID p_id = DBIDUtil.deref(it); - V p = database.get(p_id); - DistanceDBIDPair<DoubleDistance> minDist = null; - for (DBID m_i : dimensions.keySet()) { + private ArrayList<PROCLUSCluster> assignPoints(ArrayDBIDs m_current, TIntSet[] dimensions, Relation<V> database) { + ModifiableDBIDs[] clusterIDs = new ModifiableDBIDs[dimensions.length]; + for(int i = 0; i < m_current.size(); i++) { + clusterIDs[i] = DBIDUtil.newHashSet(); + } + + DBIDArrayIter m_i = m_current.iter(); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V p = database.get(it); + double minDist = Double.NaN; + int best = -1, i = 0; + for(m_i.seek(0); m_i.valid(); m_i.advance(), i++) { V m = database.get(m_i); - DistanceDBIDPair<DoubleDistance> currentDist = DBIDUtil.newDistancePair(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i); - if (minDist == null || currentDist.compareByDistance(minDist) < 0) { + double currentDist = manhattanSegmentalDistance(p, m, dimensions[i]); + if(!(minDist <= currentDist)) { minDist = currentDist; + best = i; } } // add p to cluster with mindist - assert minDist != null; - ModifiableDBIDs ids = clusterIDs.get(DBIDUtil.deref(minDist)); - ids.add(p_id); + assert best >= 0; + ModifiableDBIDs ids = clusterIDs[best]; + ids.add(it); } - Map<DBID, PROCLUSCluster> clusters = new HashMap<>(); - for (DBID m_i : dimensions.keySet()) { - ModifiableDBIDs objectIDs = clusterIDs.get(m_i); - if (!objectIDs.isEmpty()) { - TIntSet clusterDimensions = dimensions.get(m_i); - V centroid = Centroid.make(database, objectIDs).toVector(database); - clusters.put(m_i, new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); + ArrayList<PROCLUSCluster> clusters = new ArrayList<>(m_current.size()); + for(int i = 0; i < dimensions.length; i++) { + ModifiableDBIDs objectIDs = clusterIDs[i]; + if(!objectIDs.isEmpty()) { + TIntSet clusterDimensions = dimensions[i]; + Vector centroid = Centroid.make(database, objectIDs); + clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); + } + else { + clusters.add(null); } } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("clusters ").append(clusters).append('\n'); @@ -610,42 +619,43 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param database the database containing the objects * @return the assignments of the object to the clusters */ - private List<PROCLUSCluster> finalAssignment(List<Pair<V, TIntSet>> dimensions, Relation<V> database) { + private List<PROCLUSCluster> finalAssignment(List<Pair<Vector, TIntSet>> dimensions, Relation<V> database) { Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>(); - for (int i = 0; i < dimensions.size(); i++) { + for(int i = 0; i < dimensions.size(); i++) { clusterIDs.put(i, DBIDUtil.newHashSet()); } - for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { - DBID p_id = DBIDUtil.deref(it); - V p = database.get(p_id); - Pair<DoubleDistance, Integer> minDist = null; - for (int i = 0; i < dimensions.size(); i++) { - Pair<V, TIntSet> pair_i = dimensions.get(i); - V c_i = pair_i.first; + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V p = database.get(it); + double minDist = Double.POSITIVE_INFINITY; + int best = -1; + for(int i = 0; i < dimensions.size(); i++) { + Pair<Vector, TIntSet> pair_i = dimensions.get(i); + Vector c_i = pair_i.first; TIntSet dimensions_i = pair_i.second; - DoubleDistance currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i); - if (minDist == null || currentDist.compareTo(minDist.first) < 0) { - minDist = new Pair<>(currentDist, i); + double currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i); + if(best < 0 || currentDist < minDist) { + minDist = currentDist; + best = i; } } // add p to cluster with mindist - assert minDist != null; - ModifiableDBIDs ids = clusterIDs.get(minDist.second); - ids.add(p_id); + assert minDist >= 0.; + ModifiableDBIDs ids = clusterIDs.get(best); + ids.add(it); } List<PROCLUSCluster> clusters = new ArrayList<>(); - for (int i = 0; i < dimensions.size(); i++) { + for(int i = 0; i < dimensions.size(); i++) { ModifiableDBIDs objectIDs = clusterIDs.get(i); - if (!objectIDs.isEmpty()) { + if(!objectIDs.isEmpty()) { TIntSet clusterDimensions = dimensions.get(i).second; - V centroid = Centroid.make(database, objectIDs).toVector(database); + Vector centroid = Centroid.make(database, objectIDs); clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); } } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("clusters ").append(clusters).append('\n'); @@ -664,14 +674,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the Manhattan segmental distance between o1 and o2 relative to the * specified dimensions */ - private DoubleDistance manhattanSegmentalDistance(V o1, V o2, TIntSet dimensions) { + private double manhattanSegmentalDistance(NumberVector o1, NumberVector o2, TIntSet dimensions) { double result = 0; - for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { final int d = iter.next(); result += Math.abs(o1.doubleValue(d) - o2.doubleValue(d)); } result /= dimensions.size(); - return new DoubleDistance(result); + return result; } /** @@ -682,20 +692,20 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param database the database holding the objects * @return a measure for the cluster quality */ - private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, TIntSet> dimensions, Relation<V> database) { + private double evaluateClusters(ArrayList<PROCLUSCluster> clusters, TIntSet[] dimensions, Relation<V> database) { double result = 0; - for (DBID m_i : clusters.keySet()) { - PROCLUSCluster c_i = clusters.get(m_i); - V centroid_i = c_i.centroid; + for(int i = 0; i < dimensions.length; i++) { + PROCLUSCluster c_i = clusters.get(i); + Vector centroid_i = c_i.centroid; - TIntSet dims_i = dimensions.get(m_i); + TIntSet dims_i = dimensions[i]; double w_i = 0; - for (TIntIterator iter = dims_i.iterator(); iter.hasNext();) { + for(TIntIterator iter = dims_i.iterator(); iter.hasNext();) { final int j = iter.next(); w_i += avgDistance(centroid_i, c_i.objectIDs, database, j); } - w_i /= dimensions.keySet().size(); + w_i /= dimensions.length; result += c_i.objectIDs.size() * w_i; } @@ -713,9 +723,9 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the average distance of the objects to the centroid along the * specified dimension */ - private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) { + private double avgDistance(Vector centroid, DBIDs objectIDs, Relation<V> database, int dimension) { Mean avg = new Mean(); - for (DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); avg.put(Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension))); } @@ -726,16 +736,18 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * Computes the bad medoids, where the medoid of a cluster with less than the * specified threshold of objects is bad. * + * @param m_current Current medoids * @param clusters the clusters * @param threshold the threshold * @return the bad medoids */ - private ModifiableDBIDs computeBadMedoids(Map<DBID, PROCLUSCluster> clusters, int threshold) { - ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(); - for (DBID m_i : clusters.keySet()) { - PROCLUSCluster c_i = clusters.get(m_i); - if (c_i.objectIDs.size() < threshold) { - badMedoids.add(m_i); + private DBIDs computeBadMedoids(ArrayDBIDs m_current, ArrayList<PROCLUSCluster> clusters, int threshold) { + ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(m_current.size()); + int i = 0; + for(DBIDIter it = m_current.iter(); it.valid(); it.advance(), i++) { + PROCLUSCluster c_i = clusters.get(i); + if(c_i == null || c_i.objectIDs.size() < threshold) { + badMedoids.add(it); } } return badMedoids; @@ -752,6 +764,36 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } /** + * Simple triple. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private static class DoubleIntInt implements Comparable<DoubleIntInt> { + protected double first; + + protected int dimi, dimj; + + public DoubleIntInt(double first, int second, int third) { + this.first = first; + this.dimi = second; + this.dimj = third; + } + + @Override + public int compareTo(DoubleIntInt o) { + if(this.first < o.first) { + return -1; + } + if(this.first > o.first) { + return +1; + } + return 0; + } + } + + /** * Encapsulates the attributes of a cluster. * * @apiviz.exclude @@ -770,16 +812,16 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster /** * The centroids of this cluster along each dimension. */ - V centroid; + Vector centroid; /** - * Provides a new cluster with the specified parameters. + * Constructor. * * @param objectIDs the ids of the objects belonging to this cluster * @param dimensions the correlated dimensions of this cluster * @param centroid the centroid of this cluster */ - public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, V centroid) { + public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, Vector centroid) { this.objectIDs = objectIDs; this.dimensions = dimensions; this.centroid = centroid; @@ -790,10 +832,11 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster StringBuilder result = new StringBuilder(); result.append("Dimensions: ["); boolean notFirst = false; - for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { - if (notFirst) { + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + if(notFirst) { result.append(','); - } else { + } + else { notFirst = true; } result.append(iter.next()); @@ -809,10 +852,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * * @return the correlated dimensions of this cluster as BitSet */ - public BitSet getDimensions() { - BitSet result = new BitSet(); - for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { - result.set(iter.next()); + public long[] getDimensions() { + int maxdim = 0; + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + int d = iter.next(); + maxdim = (d > maxdim) ? d : maxdim; + } + long[] result = BitsUtil.zero(maxdim); + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + BitsUtil.setI(result, iter.next()); } return result; } @@ -825,14 +873,32 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedClustering.Parameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractProjectedClustering.Parameterizer { + /** + * Parameter to specify the multiplier for the initial number of medoids, + * must be an integer greater than 0. + * <p> + * Default value: {@code 10} + * </p> + * <p> + * Key: {@code -proclus.mi} + * </p> + */ + public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids."); + /** * Parameter to specify the random generator seed. */ public static final OptionID SEED_ID = new OptionID("proclus.seed", "The random number generator seed."); + /** + * Multiplier for the initial number of medoids. + */ protected int m_i = -1; + /** + * Random generator + */ protected RandomFactory rnd; @Override @@ -843,14 +909,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster configKI(config); configL(config); - IntParameter m_iP = new IntParameter(M_I_ID, 10); - m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); - if (config.grab(m_iP)) { + IntParameter m_iP = new IntParameter(M_I_ID, 10) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(m_iP)) { m_i = m_iP.getValue(); } RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java index 4e670974..a61935b0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,42 +23,50 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedDBSCAN; -import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.PreDeConCorePredicate; +import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.PreDeConNeighborPredicate; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.Model; -import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.index.preprocessed.subspaceproj.PreDeConSubspaceIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** - * <p/> * PreDeCon computes clusters of subspace preference weighted connected points. * The algorithm searches for local subgroups of a set of feature vectors having * a low variance along one or more (but not all) attributes. - * </p> - * <p/> - * Reference: <br> - * C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger: Density Connected Clustering - * with Local Subspace Preferences. <br> + * + * Reference: + * <p> + * C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger:<br /> + * Density Connected Clustering with Local Subspace Preferences.<br /> * In Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004. * </p> * * @author Peer Kröger * - * @apiviz.uses PreDeConSubspaceIndex + * @apiviz.has PreDeCon.Settings + * @apiviz.composedOf PreDeConNeighborPredicate + * @apiviz.composedOf PreDeConCorePredicate * * @param <V> the type of NumberVector handled by this Algorithm */ @Title("PreDeCon: Subspace Preference weighted Density Connected Clustering") -@Description("PreDeCon computes clusters of subspace preference weighted connected points. " + "The algorithm searches for local subgroups of a set of feature vectors having " + "a low variance along one or more (but not all) attributes.") -@Reference(authors = "C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density Connected Clustering with Local Subspace Preferences", booktitle = "Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004", url = "http://dx.doi.org/10.1109/ICDM.2004.10087") -public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN<Clustering<Model>, V> { +@Description("PreDeCon computes clusters of subspace preference weighted connected points. "// + + "The algorithm searches for local subgroups of a set of feature vectors having " + "a low variance along one or more (but not all) attributes.") +@Reference(authors = "C. Böhm, K. Kailing, H.-P. Kriegel, P. Kröger", // +title = "Density Connected Clustering with Local Subspace Preferences", // +booktitle = "Proc. 4th IEEE Int. Conf. on Data Mining (ICDM'04), Brighton, UK, 2004", // +url = "http://dx.doi.org/10.1109/ICDM.2004.10087") +public class PreDeCon<V extends NumberVector> extends GeneralizedDBSCAN { /** * The logger for this class. */ @@ -67,28 +75,165 @@ public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN /** * Constructor. * - * @param epsilon Epsilon value - * @param minpts MinPts value - * @param distanceFunction outer distance function - * @param lambda Lambda value + * @param settings PreDeCon settings. */ - public PreDeCon(DoubleDistance epsilon, int minpts, LocallyWeightedDistanceFunction<V> distanceFunction, int lambda) { - super(epsilon, minpts, distanceFunction, lambda); + public PreDeCon(PreDeCon.Settings settings) { + super(new PreDeConNeighborPredicate<>(settings), new PreDeConCorePredicate(settings), false); } @Override - public String getLongResultName() { - return "PreDeCon Clustering"; + protected Logging getLogger() { + return LOG; } - @Override - public String getShortResultName() { - return "predecon-clustering"; - } + /** + * Class containing all the PreDeCon settings. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Settings { + /** + * Query radius parameter epsilon. + */ + public double epsilon; - @Override - protected Logging getLogger() { - return LOG; + /** + * The threshold for small eigenvalues. + */ + public double delta; + + /** + * The kappa penality factor for deviations in preferred dimensions. + */ + public double kappa = Parameterizer.KAPPA_DEFAULT; + + /** + * DBSCAN Minpts parameter, aka "mu". + */ + public int minpts; + + /** + * Lambda: Maximum subspace dimensionality. + */ + public int lambda = Integer.MAX_VALUE; + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter Delta: maximum variance allowed + */ + public static final OptionID DELTA_ID = new OptionID("predecon.delta", "A double specifying the variance threshold for small Eigenvalues."); + + /** + * Parameter Kappa: penalty for deviations in preferred dimensions. + */ + public static final OptionID KAPPA_ID = new OptionID("predecon.kappa", "Penalty factor for deviations in preferred (low-variance) dimensions."); + + /** + * Default for kappa parameter. + */ + public static final double KAPPA_DEFAULT = 20.; + + /** + * Parameter Lambda: maximum dimensionality allowed. + */ + public static final OptionID LAMBDA_ID = new OptionID("predecon.lambda", "Maximum dimensionality to consider for core points."); + + /** + * Settings to build. + */ + Settings settings; + + @Override + public void makeOptions(Parameterization config) { + settings = new Settings(); + configEpsilon(config); + configMinPts(config); + configDelta(config); + configKappa(config); + configLambda(config); + } + + /** + * Configure the epsilon radius parameter. + * + * @param config Parameter source + */ + protected void configEpsilon(Parameterization config) { + DoubleParameter epsilonP = new DoubleParameter(DBSCAN.Parameterizer.EPSILON_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(epsilonP)) { + settings.epsilon = epsilonP.doubleValue(); + } + } + + /** + * Configure the minPts aka "mu" parameter. + * + * @param config Parameter source + */ + protected void configMinPts(Parameterization config) { + IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minptsP)) { + settings.minpts = minptsP.intValue(); + } + } + + /** + * Configure the delta parameter. + * + * @param config Parameter source + */ + protected void configDelta(Parameterization config) { + DoubleParameter deltaP = new DoubleParameter(DELTA_ID) // + .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + if(config.grab(deltaP)) { + settings.delta = deltaP.doubleValue(); + } + } + + /** + * Configure the kappa parameter. + * + * @param config Parameter source + */ + protected void configKappa(Parameterization config) { + DoubleParameter kappaP = new DoubleParameter(KAPPA_ID) // + .addConstraint(CommonConstraints.GREATER_THAN_ONE_DOUBLE) // + .setDefaultValue(KAPPA_DEFAULT); + if(config.grab(kappaP)) { + settings.kappa = kappaP.doubleValue(); + } + } + + /** + * Configure the delta parameter. + * + * @param config Parameter source + */ + protected void configLambda(Parameterization config) { + IntParameter lambdaP = new IntParameter(LAMBDA_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT) // + .setOptional(true); + if(config.grab(lambdaP)) { + settings.lambda = lambdaP.intValue(); + } + } + + @Override + public Settings makeInstance() { + return settings; + } + } } /** @@ -98,20 +243,20 @@ public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedDBSCAN.Parameterizer<V, DoubleDistance> { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * PreDeConSettings. + */ + protected PreDeCon.Settings settings; + @Override protected void makeOptions(Parameterization config) { - super.makeOptions(config); - configInnerDistance(config); - configEpsilon(config, innerdist); - configMinPts(config); - configOuterDistance(config, epsilon, minpts, PreDeConSubspaceIndex.Factory.class, innerdist); - configLambda(config); + settings = config.tryInstantiate(PreDeCon.Settings.class); } @Override protected PreDeCon<V> makeInstance() { - return new PreDeCon<>(epsilon, minpts, outerdist, lambda); + return new PreDeCon<>(settings); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index e6245f6e..d6cf40fd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; */ import java.util.ArrayList; -import java.util.BitSet; import java.util.HashMap; import java.util.List; import java.util.TreeMap; @@ -45,10 +44,10 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionSelectingSubspaceDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -56,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -69,8 +68,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * </p> * <p> * Reference: <br> - * K. Kailing, H.-P. Kriegel, P. Kroeger: Density connected Subspace Clustering - * for High Dimensional Data. <br> + * K. Kailing, H.-P. Kriegel, P. Kröger:<br /> + * Density connected Subspace Clustering for High Dimensional Data<br /> * In Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004. * </p> * @@ -83,9 +82,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @param <V> the type of FeatureVector handled by this Algorithm */ @Title("SUBCLU: Density connected Subspace Clustering") -@Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.") -@Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004") -public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +@Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. "// + + "SUBCLU delivers for each subspace the same clusters DBSCAN would have found, "// + + "when applied to this subspace seperately.") +@Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", // +title = "Density connected Subspace Clustering for High Dimensional Data", // +booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004", // +url = "http://www.siam.org/meetings/sdm04/proceedings/sdm04_023.pdf") +public class SUBCLU<V extends NumberVector> extends AbstractAlgorithm<Clustering<SubspaceModel>> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** * The logger for this class. */ @@ -125,12 +129,12 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * Holds the instance of the distance function specified by * {@link #DISTANCE_FUNCTION_ID}. */ - private DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction; + private DimensionSelectingSubspaceDistanceFunction<V> distanceFunction; /** * Holds the value of {@link #EPSILON_ID}. */ - private DoubleDistance epsilon; + private double epsilon; /** * Holds the value of {@link #MINPTS_ID}. @@ -140,7 +144,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster /** * Holds the result; */ - private Clustering<SubspaceModel<V>> result; + private Clustering<SubspaceModel> result; /** * Constructor. @@ -149,7 +153,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * @param epsilon Epsilon value * @param minpts Minpts value */ - public SUBCLU(DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction, DoubleDistance epsilon, int minpts) { + public SUBCLU(DimensionSelectingSubspaceDistanceFunction<V> distanceFunction, double epsilon, int minpts) { super(); this.distanceFunction = distanceFunction; this.epsilon = epsilon; @@ -162,15 +166,13 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * @param relation Relation to process * @return Clustering result */ - public Clustering<SubspaceModel<V>> run(Relation<V> relation) { + public Clustering<SubspaceModel> run(Relation<V> relation) { final int dimensionality = RelationUtil.dimensionality(relation); StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters - if (stepprog != null) { - stepprog.beginStep(1, "Generate all 1-dimensional clusters.", LOG); - } + LOG.beginStep(stepprog, 1, "Generate all 1-dimensional clusters."); // mapping of dimensionality to set of subspaces HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>(); @@ -182,35 +184,35 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster // mapping of subspaces to list of clusters TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator()); - for (int d = 0; d < dimensionality; d++) { + for(int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); - if (LOG.isDebuggingFiner()) { + if(LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); msg.append('\n').append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n"); - for (Cluster<Model> cluster : clusters) { + for(Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } LOG.debugFiner(msg.toString()); } - if (!clusters.isEmpty()) { + if(!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters - for (int d = 0; d < dimensionality - 1; d++) { - if (stepprog != null) { + for(int d = 0; d < dimensionality - 1; d++) { + if(stepprog != null) { stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } List<Subspace> subspaces = subspaceMap.get(d); - if (subspaces == null || subspaces.isEmpty()) { - if (stepprog != null) { - for (int dim = d + 1; dim < dimensionality - 1; dim++) { + if(subspaces == null || subspaces.isEmpty()) { + if(stepprog != null) { + for(int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep(dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG); } } @@ -220,37 +222,37 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster List<Subspace> candidates = generateSubspaceCandidates(subspaces); List<Subspace> s_d = new ArrayList<>(); - for (Subspace candidate : candidates) { + for(Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); - if (LOG.isDebuggingFine()) { + if(LOG.isDebuggingFine()) { LOG.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); List<Cluster<Model>> clusters = new ArrayList<>(); - for (Cluster<Model> cluster : bestSubspaceClusters) { + for(Cluster<Model> cluster : bestSubspaceClusters) { List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); - if (!candidateClusters.isEmpty()) { + if(!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } - if (LOG.isDebuggingFine()) { + if(LOG.isDebuggingFine()) { StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); - for (Cluster<Model> c : clusters) { + for(Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } LOG.debugFine(msg.toString()); } - if (!clusters.isEmpty()) { + if(!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } - if (!s_d.isEmpty()) { + if(!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } @@ -258,19 +260,17 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster // build result int numClusters = 1; result = new Clustering<>("SUBCLU clustering", "subclu-clustering"); - for (Subspace subspace : clusterMap.descendingKeySet()) { + for(Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); - for (Cluster<Model> cluster : clusters) { - Cluster<SubspaceModel<V>> newCluster = new Cluster<>(cluster.getIDs()); - newCluster.setModel(new SubspaceModel<>(subspace, Centroid.make(relation, cluster.getIDs()).toVector(relation))); + for(Cluster<Model> cluster : clusters) { + Cluster<SubspaceModel> newCluster = new Cluster<>(cluster.getIDs()); + newCluster.setModel(new SubspaceModel(subspace, Centroid.make(relation, cluster.getIDs()))); newCluster.setName("cluster_" + numClusters++); result.addToplevelCluster(newCluster); } } - if (stepprog != null) { - stepprog.setCompleted(LOG); - } + LOG.setCompleted(stepprog); return result; } @@ -279,7 +279,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * * @return the result of the algorithm */ - public Clustering<SubspaceModel<V>> getResult() { + public Clustering<SubspaceModel> getResult() { return result; } @@ -300,7 +300,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster distanceFunction.setSelectedDimensions(subspace.getDimensions()); ProxyDatabase proxy; - if (ids == null) { + if(ids == null) { // TODO: in this case, we might want to use an index - the proxy below // will prevent this! ids = relation.getDBIDs(); @@ -308,9 +308,9 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster proxy = new ProxyDatabase(ids, relation); - DBSCAN<V, DoubleDistance> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts); + DBSCAN<V> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts); // run DBSCAN - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); } Clustering<Model> dbsres = dbscan.run(proxy); @@ -318,8 +318,8 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster // separate cluster and noise List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters(); List<Cluster<Model>> clusters = new ArrayList<>(); - for (Cluster<Model> c : clusterAndNoise) { - if (!c.isNoise()) { + for(Cluster<Model> c : clusterAndNoise) { + if(!c.isNoise()) { clusters.add(c); } } @@ -336,7 +336,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) { List<Subspace> candidates = new ArrayList<>(); - if (subspaces.isEmpty()) { + if(subspaces.isEmpty()) { return candidates; } @@ -344,46 +344,46 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster int d = subspaces.get(0).dimensionality(); StringBuilder msgFine = new StringBuilder("\n"); - if (LOG.isDebuggingFiner()) { + if(LOG.isDebuggingFiner()) { msgFine.append("subspaces ").append(subspaces).append('\n'); } - for (int i = 0; i < subspaces.size(); i++) { + for(int i = 0; i < subspaces.size(); i++) { Subspace s1 = subspaces.get(i); - for (int j = i + 1; j < subspaces.size(); j++) { + for(int j = i + 1; j < subspaces.size(); j++) { Subspace s2 = subspaces.get(j); Subspace candidate = s1.join(s2); - if (candidate != null) { - if (LOG.isDebuggingFiner()) { + if(candidate != null) { + if(LOG.isDebuggingFiner()) { msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n'); } // prune irrelevant candidate subspaces List<Subspace> lowerSubspaces = lowerSubspaces(candidate); - if (LOG.isDebuggingFiner()) { + if(LOG.isDebuggingFiner()) { msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n'); } boolean irrelevantCandidate = false; - for (Subspace s : lowerSubspaces) { - if (!subspaces.contains(s)) { + for(Subspace s : lowerSubspaces) { + if(!subspaces.contains(s)) { irrelevantCandidate = true; break; } } - if (!irrelevantCandidate) { + if(!irrelevantCandidate) { candidates.add(candidate); } } } } - if (LOG.isDebuggingFiner()) { + if(LOG.isDebuggingFiner()) { LOG.debugFiner(msgFine.toString()); } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append(d + 1).append("-dimensional candidate subspaces: "); - for (Subspace candidate : candidates) { + for(Subspace candidate : candidates) { msg.append(candidate.dimensonsToString()).append(' '); } LOG.debug(msg.toString()); @@ -401,16 +401,16 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster */ private List<Subspace> lowerSubspaces(Subspace subspace) { int dimensionality = subspace.dimensionality(); - if (dimensionality <= 1) { + if(dimensionality <= 1) { return null; } // order result according to the dimensions List<Subspace> result = new ArrayList<>(); - BitSet dimensions = subspace.getDimensions(); - for (int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) { - BitSet newDimensions = (BitSet) dimensions.clone(); - newDimensions.set(dim, false); + long[] dimensions = subspace.getDimensions(); + for(int dim = BitsUtil.nextSetBit(dimensions, 0); dim >= 0; dim = BitsUtil.nextSetBit(dimensions, dim + 1)) { + long[] newDimensions = dimensions.clone(); + BitsUtil.clearI(newDimensions, dim); result.add(new Subspace(newDimensions)); } @@ -432,14 +432,14 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster private Subspace bestSubspace(List<Subspace> subspaces, Subspace candidate, TreeMap<Subspace, List<Cluster<Model>>> clusterMap) { Subspace bestSubspace = null; - for (Subspace subspace : subspaces) { + for(Subspace subspace : subspaces) { int min = Integer.MAX_VALUE; - if (subspace.isSubspace(candidate)) { + if(subspace.isSubspace(candidate)) { List<Cluster<Model>> clusters = clusterMap.get(subspace); - for (Cluster<Model> cluster : clusters) { + for(Cluster<Model> cluster : clusters) { int clusterSize = cluster.size(); - if (clusterSize < min) { + if(clusterSize < min) { min = clusterSize; bestSubspace = subspace; } @@ -467,29 +467,29 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { protected int minpts = 0; - protected DoubleDistance epsilon = null; + protected double epsilon; - protected DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distance = null; + protected DimensionSelectingSubspaceDistanceFunction<V> distance = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance>> param = new ObjectParameter<>(DISTANCE_FUNCTION_ID, DimensionSelectingSubspaceDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); - if (config.grab(param)) { + ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V>> param = new ObjectParameter<>(DISTANCE_FUNCTION_ID, DimensionSelectingSubspaceDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); + if(config.grab(param)) { distance = param.instantiateClass(config); } - DistanceParameter<DoubleDistance> epsilonP = new DistanceParameter<>(EPSILON_ID, distance); - if (config.grab(epsilonP)) { + DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID); + if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); - if (config.grab(minptsP)) { + if(config.grab(minptsP)) { minpts = minptsP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java index 561816bd..c76c22b8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -34,6 +34,6 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; * * @param <M> Model type */ -public interface SubspaceClusteringAlgorithm<M extends SubspaceModel<?>> extends ClusteringAlgorithm<Clustering<M>> { +public interface SubspaceClusteringAlgorithm<M extends SubspaceModel> extends ClusteringAlgorithm<Clustering<M>> { // No additional constraints } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java new file mode 100644 index 00000000..42f1eaa2 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEInterval.java @@ -0,0 +1,132 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.FormatUtil; + +/** + * Represents an interval in a certain dimension of the data space. + * + * @author Elke Achtert + */ +public class CLIQUEInterval implements Comparable<CLIQUEInterval> { + /** + * The dimension of this interval in the (original) data space. + */ + private int dimension; + + /** + * The minimum (left) value of this interval. + */ + private double min; + + /** + * The maximum (right) value of this interval. + */ + private double max; + + /** + * Creates a new interval with the specified parameters. + * + * @param dimension the dimension of the interval in the original data space + * @param min the minimum (left) value of the interval + * @param max the maximum (right) value of the interval + */ + public CLIQUEInterval(int dimension, double min, double max) { + this.dimension = dimension; + this.min = min; + this.max = max; + } + + /** + * Returns the dimension of the interval in the original data space + * + * @return the dimension of the interval in the original data space + */ + public int getDimension() { + return dimension; + } + + /** + * Returns the minimum (left) value of the interval. + * + * @return the minimum (left) value of the interval + */ + public double getMin() { + return min; + } + + /** + * Returns the maximum (right) value of the interval. + * + * @return the maximum (right) value of the interval + */ + public double getMax() { + return max; + } + + /** + * Returns a string representation of this interval. The string representation + * consists of the dimension and the min and max values of this interval. + * + * @return a string representation of this interval + */ + @Override + public String toString() { + return "d" + (dimension + 1) + "-[" + FormatUtil.NF2.format(min) + "; " + FormatUtil.NF2.format(max) + "["; + } + + /** + * Compares this interval with the specified interval for order. Returns a + * negative integer, zero, or a positive integer as this interval is less + * than, equal to, or greater than the specified interval. First the + * dimensions of the intervals are compared. In case of equality the min + * (left) values are compared. + * + * @param other the interval to be compared + * @return a negative integer, zero, or a positive integer as this object is + * less than, equal to, or greater than the specified object. + */ + @Override + public int compareTo(CLIQUEInterval other) { + if(dimension < other.dimension) { + return -1; + } + if(dimension > other.dimension) { + return 1; + } + + if(min < other.min) { + return -1; + } + if(min > other.min) { + return 1; + } + + if(max != other.max) { + throw new RuntimeException("Should never happen!"); + } + return 0; + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java index 50e3fcd5..4df3e2a5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,16 +24,15 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; */ import java.util.ArrayList; -import java.util.BitSet; import java.util.Collection; import java.util.Comparator; import java.util.List; -import de.lmu.ifi.dbs.elki.data.Interval; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.Subspace; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** @@ -46,7 +45,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param <V> the type of NumberVector this subspace contains */ -public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { +public class CLIQUESubspace<V extends NumberVector> extends Subspace { /** * The dense units belonging to this subspace. */ @@ -74,7 +73,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { * * @param dimensions the dimensions building this subspace */ - public CLIQUESubspace(BitSet dimensions) { + public CLIQUESubspace(long[] dimensions) { super(dimensions); denseUnits = new ArrayList<>(); coverage = 0; @@ -86,9 +85,9 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { * @param unit the unit to be added. */ public void addDenseUnit(CLIQUEUnit<V> unit) { - Collection<Interval> intervals = unit.getIntervals(); - for(Interval interval : intervals) { - if(!getDimensions().get(interval.getDimension())) { + Collection<CLIQUEInterval> intervals = unit.getIntervals(); + for(CLIQUEInterval interval : intervals) { + if(!BitsUtil.get(getDimensions(), interval.getDimension())) { throw new IllegalArgumentException("Unit " + unit + "cannot be added to this subspace, because of wrong dimensions!"); } } @@ -131,7 +130,8 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { unit.markAsAssigned(); model.addDenseUnit(unit); - for(int dim = getDimensions().nextSetBit(0); dim >= 0; dim = getDimensions().nextSetBit(dim + 1)) { + final long[] dims = getDimensions(); + for(int dim = BitsUtil.nextSetBit(dims, 0); dim >= 0; dim = BitsUtil.nextSetBit(dims, dim + 1)) { CLIQUEUnit<V> left = leftNeighbor(unit, dim); if(left != null && !left.isAssigned()) { dfs(left, cluster, model); @@ -152,7 +152,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { * @return the left neighbor of the given unit in the specified dimension */ public CLIQUEUnit<V> leftNeighbor(CLIQUEUnit<V> unit, int dim) { - Interval i = unit.getInterval(Integer.valueOf(dim)); + CLIQUEInterval i = unit.getInterval(dim); for(CLIQUEUnit<V> u : getDenseUnits()) { if(u.containsLeftNeighbor(i)) { @@ -170,7 +170,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { * @return the right neighbor of the given unit in the specified dimension */ public CLIQUEUnit<V> rightNeighbor(CLIQUEUnit<V> unit, Integer dim) { - Interval i = unit.getInterval(dim); + CLIQUEInterval i = unit.getInterval(dim); for(CLIQUEUnit<V> u : getDenseUnits()) { if(u.containsRightNeighbor(i)) { @@ -212,7 +212,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { * @see de.lmu.ifi.dbs.elki.data.Subspace#joinLastDimensions */ public CLIQUESubspace<V> join(CLIQUESubspace<V> other, double all, double tau) { - BitSet dimensions = joinLastDimensions(other); + long[] dimensions = joinLastDimensions(other); if(dimensions == null) { return null; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java index a71b2b67..8e9f139c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,7 +29,6 @@ import java.util.Iterator; import java.util.SortedSet; import java.util.TreeSet; -import de.lmu.ifi.dbs.elki.data.Interval; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -42,21 +41,22 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; * * @author Elke Achtert * + * @apiviz.composedOf CLIQUEInterval * @apiviz.composedOf ModifiableDBIDs * * @param <V> the type of NumberVector this unit contains */ -public class CLIQUEUnit<V extends NumberVector<?>> { +public class CLIQUEUnit<V extends NumberVector> { /** * The one-dimensional intervals of which this unit is build. */ - private SortedSet<Interval> intervals; + private SortedSet<CLIQUEInterval> intervals; /** - * Provides a mapping of particular dimensions to the intervals of which this - * unit is build. + * Mapping of particular dimensions to the intervals of which this unit is + * build. */ - private TIntObjectHashMap<Interval> dimensionToInterval; + private TIntObjectHashMap<CLIQUEInterval> dimensionToInterval; /** * The ids of the feature vectors this unit contains. @@ -74,11 +74,11 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * @param intervals the intervals belonging to this unit * @param ids the ids of the feature vectors belonging to this unit */ - public CLIQUEUnit(SortedSet<Interval> intervals, ModifiableDBIDs ids) { + public CLIQUEUnit(SortedSet<CLIQUEInterval> intervals, ModifiableDBIDs ids) { this.intervals = intervals; dimensionToInterval = new TIntObjectHashMap<>(); - for(Interval interval : intervals) { + for(CLIQUEInterval interval : intervals) { dimensionToInterval.put(interval.getDimension(), interval); } @@ -92,7 +92,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * * @param interval the interval belonging to this unit */ - public CLIQUEUnit(Interval interval) { + public CLIQUEUnit(CLIQUEInterval interval) { intervals = new TreeSet<>(); intervals.add(interval); @@ -113,7 +113,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * vector, false otherwise */ public boolean contains(V vector) { - for(Interval interval : intervals) { + for(CLIQUEInterval interval : intervals) { final double value = vector.doubleValue(interval.getDimension()); if(interval.getMin() > value || value >= interval.getMax()) { return false; @@ -164,7 +164,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * * @return a sorted set of the intervals of which this unit is build */ - public SortedSet<Interval> getIntervals() { + public SortedSet<CLIQUEInterval> getIntervals() { return intervals; } @@ -174,7 +174,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * @param dimension the dimension of the interval to be returned * @return the interval of the specified dimension */ - public Interval getInterval(Integer dimension) { + public CLIQUEInterval getInterval(int dimension) { return dimensionToInterval.get(dimension); } @@ -186,8 +186,8 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * @return true if this unit contains the left neighbor of the specified * interval, false otherwise */ - public boolean containsLeftNeighbor(Interval i) { - Interval interval = dimensionToInterval.get(i.getDimension()); + public boolean containsLeftNeighbor(CLIQUEInterval i) { + CLIQUEInterval interval = dimensionToInterval.get(i.getDimension()); if(interval == null) { return false; } @@ -202,8 +202,8 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * @return true if this unit contains the right neighbor of the specified * interval, false otherwise */ - public boolean containsRightNeighbor(Interval i) { - Interval interval = dimensionToInterval.get(i.getDimension()); + public boolean containsRightNeighbor(CLIQUEInterval i) { + CLIQUEInterval interval = dimensionToInterval.get(i.getDimension()); if(interval == null) { return false; } @@ -246,15 +246,15 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * greater than tau, null otherwise */ public CLIQUEUnit<V> join(CLIQUEUnit<V> other, double all, double tau) { - Interval i1 = this.intervals.last(); - Interval i2 = other.intervals.last(); + CLIQUEInterval i1 = this.intervals.last(); + CLIQUEInterval i2 = other.intervals.last(); if(i1.getDimension() >= i2.getDimension()) { return null; } - Iterator<Interval> it1 = this.intervals.iterator(); - Iterator<Interval> it2 = other.intervals.iterator(); - SortedSet<Interval> resultIntervals = new TreeSet<>(); + Iterator<CLIQUEInterval> it1 = this.intervals.iterator(); + Iterator<CLIQUEInterval> it2 = other.intervals.iterator(); + SortedSet<CLIQUEInterval> resultIntervals = new TreeSet<>(); for(int i = 0; i < this.intervals.size() - 1; i++) { i1 = it1.next(); i2 = it2.next(); @@ -285,7 +285,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { @Override public String toString() { StringBuilder result = new StringBuilder(); - for(Interval interval : intervals) { + for(CLIQUEInterval interval : intervals) { result.append(interval).append(' '); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java index 7acd7572..87f435b4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 +Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java index 2efa038d..6247de84 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java @@ -5,27 +5,30 @@ * subspace clustering algorithms according to the classical but somewhat obsolete classification schema * of clustering algorithms for axis-parallel subspaces. * + * @apiviz.exclude ^de.lmu.ifi.dbs.elki.algorithm.Abstract + * @apiviz.exclude ^de.lmu.ifi.dbs.elki.algorithm\.clustering\.subspace\.P3C\.(ClusterCandidate|Signature) + * @apiviz.exclude ^de.lmu.ifi.dbs.elki.algorithm\.clustering\.subspace\.clique\. */ /* -This file is part of ELKI: -Environment for Developing KDD-Applications Supported by Index-Structures + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2013 -Ludwig-Maximilians-Universität München -Lehr- und Forschungseinheit für Datenbanksysteme -ELKI Development Team + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
\ No newline at end of file |