Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java | 14
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java | 21
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java | 51
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java | 4
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java | 364
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java | 11
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java | 11
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java | 4
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java | 350
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java | 59
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java | 148
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java | 153
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java | 27
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java | 302
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java | 900
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java | 28
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java | 10
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java | 6
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java | 25
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java | 96
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java | 89
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java | 11
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java | 5
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java | 217
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java | 195
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java | 27
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java | 33
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java | 346
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java | 4
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java | 155
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java | 8
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java | 11
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java | 8
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java | 8
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java | 53
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java | 75
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java | 2
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java | 2
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java | 23
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java | 384
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java | 27
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java | 9
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java | 605
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java | 160
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java | 7
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java | 1000
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java | 6
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java | 6
48 files changed, 5330 insertions, 730 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
index 0c4eb5fc..96c95a9f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
@@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistance
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -152,8 +152,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext
*/
protected void configK(Parameterization config) {
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
}
@@ -165,8 +165,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext
*/
protected void configKI(Parameterization config) {
IntParameter k_iP = new IntParameter(K_I_ID, 30);
- k_iP.addConstraint(new GreaterConstraint(0));
- if (config.grab(k_iP)) {
+ k_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(k_iP)) {
k_i = k_iP.getValue();
}
}
@@ -178,8 +178,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext
*/
protected void configL(Parameterization config) {
IntParameter lP = new IntParameter(L_ID);
- lP.addConstraint(new GreaterConstraint(0));
- if (config.grab(lP)) {
+ lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(lP)) {
l = lP.getValue();
}
}
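The change above is the template for most hunks in this commit: each per-use new GreaterConstraint(0) is replaced by the shared preset CommonConstraints.GREATER_EQUAL_ONE_INT, which is equivalent for integer parameters (x > 0 holds exactly when x >= 1) and avoids allocating a constraint object per parameter. A minimal sketch of the resulting idiom, assuming only the ELKI 0.6 option-handling classes imported in this file (IntParameter, Parameterization, CommonConstraints); K_ID and the field k are assumed declared in the enclosing class, as in the code above:

    // Sketch of the post-migration parameter idiom.
    protected void configK(Parameterization config) {
      IntParameter kP = new IntParameter(K_ID);
      // Shared stateless preset instead of new GreaterConstraint(0):
      kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(kP)) { // grab() parses and validates the parameter
        k = kP.getValue();
      }
    }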
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
index ee3b234c..52e37197 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -294,7 +294,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
// try to expand the cluster
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
ModifiableDBIDs seeds = DBIDUtil.newHashSet();
- for (DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) {
+ for(DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) {
int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension();
// nextID is not reachable from start object
if(nextID_corrDim > lambda) {
@@ -322,9 +322,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
DistanceDBIDList<DoubleDistance> reachables = rangeQuery.getRangeForDBID(iter, epsilon);
iter.remove();
-
+
if(reachables.size() > minpts) {
- for (DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) {
+ for(DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) {
int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension();
// r is not reachable from q
if(corrDim_r > lambda) {
@@ -351,9 +351,10 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
}
}
- /* if(processedIDs.size() == relation.size() && noise.size() == 0) {
- break;
- } */
+ /*
+ * if(processedIDs.size() == relation.size() && noise.size() == 0) {
+ * break; }
+ */
}
if(currentCluster.size() >= minpts) {
@@ -375,7 +376,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(distanceFunction.getInputTypeRestriction());
}
-
+
/**
* Parameterization class.
*
@@ -411,7 +412,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
protected void configMinPts(Parameterization config) {
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
@@ -435,7 +436,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
protected void configLambda(Parameterization config) {
IntParameter lambdaP = new IntParameter(LAMBDA_ID);
- lambdaP.addConstraint(new GreaterConstraint(0));
+ lambdaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(lambdaP)) {
lambda = lambdaP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
index 57dcb435..09c78fec 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
@@ -38,9 +38,8 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
@@ -52,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -82,24 +81,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
private static final Logging LOG = Logging.getLogger(DBSCAN.class);
/**
- * Parameter to specify the maximum radius of the neighborhood to be
- * considered, must be suitable to the distance function specified.
+ * Holds the epsilon radius threshold.
*/
- public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");
+ protected D epsilon;
/**
- * Holds the value of {@link #EPSILON_ID}.
- */
- private D epsilon;
-
- /**
- * Parameter to specify the threshold for minimum number of points in the
- * epsilon-neighborhood of a point, must be an integer greater than 0.
- */
- public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point.");
-
- /**
- * Holds the value of {@link #MINPTS_ID}.
+ * Holds the minimum cluster size.
*/
protected int minpts;
@@ -146,7 +133,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
if(size < minpts) {
// There can't be any clusters
noise.addDBIDs(relation.getDBIDs());
- objprog.setProcessed(noise.size(), LOG);
+ if(objprog != null) {
+ objprog.setProcessed(noise.size(), LOG);
+ }
}
else {
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
@@ -193,7 +182,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
* @param objprog the progress object for logging the current status
*/
protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
- DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
+ DBIDs neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
// startObject is no core-object
if(neighbors.size() < minpts) {
@@ -207,7 +196,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
// try to expand the cluster
- HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
+ ModifiableDBIDs seeds = DBIDUtil.newHashSet();
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
for(DBIDIter seed = neighbors.iter(); seed.valid(); seed.advance()) {
if(!processedIDs.contains(seed)) {
@@ -222,9 +211,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
seeds.remove(startObjectID);
- while(seeds.size() > 0) {
+ while(!seeds.isEmpty()) {
DBIDMIter o = seeds.iter();
- DistanceDBIDList<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon);
+ DBIDs neighborhood = rangeQuery.getRangeForDBID(o, epsilon);
o.remove();
if(neighborhood.size() >= minpts) {
@@ -282,6 +271,18 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
* @apiviz.exclude
*/
public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ /**
+ * Parameter to specify the maximum radius of the neighborhood to be
+ * considered, must be suitable to the distance function specified.
+ */
+ public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");
+
+ /**
+ * Parameter to specify the threshold for minimum number of points in the
+ * epsilon-neighborhood of a point, must be an integer greater than 0.
+ */
+ public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point.");
+
protected D epsilon = null;
protected int minpts = 0;
@@ -295,7 +296,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
@@ -306,4 +307,4 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
return new DBSCAN<>(distanceFunction, epsilon, minpts);
}
}
-}
\ No newline at end of file
+}
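Three changes in DBSCAN: the range query results are now consumed as plain DBIDs (the distances were never used during cluster expansion), the progress logger is null-guarded (it is only instantiated in verbose mode), and the EPSILON_ID and MINPTS_ID constants moved from the algorithm class into its Parameterizer, so programmatic configuration references them through the nested class. A hedged caller sketch; ListParameterization and ClassGenericsUtil are assumed from the ELKI 0.6 utilities (ListParameterization is imported by AbstractProjectedDBSCAN above), DoubleVector/DoubleDistance are assumed input types, and the parameter values are illustrative only:

    // Hypothetical caller after the OptionID move into the Parameterizer.
    ListParameterization params = new ListParameterization();
    params.addParameter(DBSCAN.Parameterizer.EPSILON_ID, 0.04);
    params.addParameter(DBSCAN.Parameterizer.MINPTS_ID, 20);
    DBSCAN<DoubleVector, DoubleDistance> dbscan = ClassGenericsUtil.parameterizeOrAbort(DBSCAN.class, params);
    Clustering<Model> result = dbscan.run(database); // database assumed to hold DoubleVectors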
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
index 3c2e0278..814b4cc4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
@@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -496,7 +496,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
index c66442a1..e82ec674 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
@@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
@@ -41,14 +42,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
@@ -57,8 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -72,8 +73,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* zero-covariance and variance=1 in covariance matrices.
* </p>
* <p>
- * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin: Maximum Likelihood from
- * Incomplete Data via the EM algorithm. <br>
+ * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br />
+ * Maximum Likelihood from Incomplete Data via the EM algorithm.<br>
* In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31
* </p>
*
@@ -100,48 +101,36 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
private static final double SINGULARITY_CHEAT = 1E-9;
/**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater than 0.
- */
- public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find.");
-
- /**
- * Holds the value of {@link #K_ID}.
+ * Number of clusters
*/
private int k;
/**
- * Parameter to specify the termination criterion for maximization of E(M):
- * E(M) - E(M') < em.delta, must be a double equal to or greater than 0.
+ * Delta parameter
*/
- public static final OptionID DELTA_ID = new OptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta");
+ private double delta;
/**
- * Parameter to specify the initialization method
+ * Class to choose the initial means
*/
- public static final OptionID INIT_ID = new OptionID("kmeans.initialization", "Method to choose the initial means.");
-
- private static final double MIN_LOGLIKELIHOOD = -100000;
+ private KMeansInitialization<V> initializer;
/**
- * Holds the value of {@link #DELTA_ID}.
+ * Maximum number of iterations to allow
*/
- private double delta;
+ private int maxiter;
/**
- * Store the individual probabilities, for use by EMOutlierDetection etc.
+ * Retain soft assignments.
*/
- private WritableDataStore<double[]> probClusterIGivenX;
+ private boolean soft;
- /**
- * Class to choose the initial means
- */
- private KMeansInitialization<V> initializer;
+ private static final double MIN_LOGLIKELIHOOD = -100000;
/**
- * Maximum number of iterations to allow
+ * Soft assignment result type.
*/
- private int maxiter;
+ public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class);
/**
* Constructor.
@@ -150,13 +139,15 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @param delta delta parameter
* @param initializer Class to choose the initial means
* @param maxiter Maximum number of iterations
+ * @param soft Include soft assignments
*/
- public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter) {
+ public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter, boolean soft) {
super();
this.k = k;
this.delta = delta;
this.initializer = initializer;
this.maxiter = maxiter;
+ this.setSoft(soft);
}
/**
@@ -172,137 +163,80 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @return Result
*/
public Clustering<EMModel<V>> run(Database database, Relation<V> relation) {
- if (relation.size() == 0) {
+ if(relation.size() == 0) {
throw new IllegalArgumentException("database empty: must contain elements");
}
// initial models
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("initializing " + k + " models");
}
- List<Vector> means = new ArrayList<>();
- for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC)) {
- means.add(nv.getColumnVector());
+ final List<V> initialMeans = initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC);
+ assert (initialMeans.size() == k);
+ Vector[] means = new Vector[k];
+ {
+ int i = 0;
+ for(NumberVector<?> nv : initialMeans) {
+ means[i] = nv.getColumnVector();
+ i++;
+ }
}
- List<Matrix> covarianceMatrices = new ArrayList<>(k);
+ Matrix[] covarianceMatrices = new Matrix[k];
double[] normDistrFactor = new double[k];
- List<Matrix> invCovMatr = new ArrayList<>(k);
+ Matrix[] invCovMatr = new Matrix[k];
double[] clusterWeights = new double[k];
- probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
- final int dimensionality = means.get(0).getDimensionality();
- for (int i = 0; i < k; i++) {
+ final int dimensionality = means[0].getDimensionality();
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality);
+ for(int i = 0; i < k; i++) {
Matrix m = Matrix.identity(dimensionality, dimensionality);
- covarianceMatrices.add(m);
- final double det = m.det();
- if (det > 0.) {
- normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det);
- } else {
- LOG.warning("Encountered matrix with 0 determinant - degenerated.");
- normDistrFactor[i] = 1.0; // Not really well defined
- }
- invCovMatr.add(m.inverse());
+ covarianceMatrices[i] = m;
+ normDistrFactor[i] = 1.0 / Math.sqrt(norm);
+ invCovMatr[i] = Matrix.identity(dimensionality, dimensionality);
clusterWeights[i] = 1.0 / k;
- if (LOG.isDebuggingFinest()) {
- StringBuilder msg = new StringBuilder();
- msg.append(" model ").append(i).append(":\n");
- msg.append(" mean: ").append(means.get(i)).append('\n');
- msg.append(" m:\n").append(FormatUtil.format(m, " ")).append('\n');
- msg.append(" m.det(): ").append(det).append('\n');
- msg.append(" cluster weight: ").append(clusterWeights[i]).append('\n');
- msg.append(" normDistFact: ").append(normDistrFactor[i]).append('\n');
- LOG.debugFine(msg.toString());
- }
}
double emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
// iteration unless no change
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("iterating EM");
}
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("iteration " + 0 + " - expectation value: " + emNew);
}
- double em;
- for (int it = 1; it <= maxiter || maxiter < 0; it++) {
- em = emNew;
-
- // recompute models
- List<Vector> meanSums = new ArrayList<>(k);
- double[] sumOfClusterProbabilities = new double[k];
-
- for (int i = 0; i < k; i++) {
- clusterWeights[i] = 0.0;
- meanSums.add(new Vector(dimensionality));
- covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality));
- }
-
- // weights and means
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- double[] clusterProbabilities = probClusterIGivenX.get(iditer);
-
- for (int i = 0; i < k; i++) {
- sumOfClusterProbabilities[i] += clusterProbabilities[i];
- Vector summand = relation.get(iditer).getColumnVector().timesEquals(clusterProbabilities[i]);
- meanSums.get(i).plusEquals(summand);
- }
- }
- final int n = relation.size();
- for (int i = 0; i < k; i++) {
- clusterWeights[i] = sumOfClusterProbabilities[i] / n;
- Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]);
- means.set(i, newMean);
- }
- // covariance matrices
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- double[] clusterProbabilities = probClusterIGivenX.get(iditer);
- Vector instance = relation.get(iditer).getColumnVector();
- for (int i = 0; i < k; i++) {
- Vector difference = instance.minus(means.get(i));
- covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i]));
- }
- }
- for (int i = 0; i < k; i++) {
- covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT));
- }
- for (int i = 0; i < k; i++) {
- final double det = covarianceMatrices.get(i).det();
- if (det > 0.) {
- normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det);
- } else {
- LOG.warning("Encountered matrix with 0 determinant - degenerated.");
- normDistrFactor[i] = 1.0; // Not really well defined
- }
- invCovMatr.set(i, covarianceMatrices.get(i).inverse());
- }
+ for(int it = 1; it <= maxiter || maxiter < 0; it++) {
+ final double emOld = emNew;
+ recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dimensionality);
+ computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
// reassign probabilities
emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
}
- if (Math.abs(em - emNew) <= delta) {
+ if(Math.abs(emOld - emNew) <= delta) {
break;
}
}
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("assigning clusters");
}
// fill result with clusters and models
List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
hardClusters.add(DBIDUtil.newHashSet());
}
// provide a hard clustering
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double[] clusterProbabilities = probClusterIGivenX.get(iditer);
int maxIndex = 0;
double currentMax = 0.0;
- for (int i = 0; i < k; i++) {
- if (clusterProbabilities[i] > currentMax) {
+ for(int i = 0; i < k; i++) {
+ if(clusterProbabilities[i] > currentMax) {
maxIndex = i;
currentMax = clusterProbabilities[i];
}
@@ -312,24 +246,89 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Clustering<EMModel<V>> result = new Clustering<>("EM Clustering", "em-clustering");
// provide models within the result
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
// TODO: re-do labeling.
// SimpleClassLabel label = new SimpleClassLabel();
// label.init(result.canonicalClusterLabel(i));
- Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i)));
+ Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means[i].getArrayRef()), covarianceMatrices[i]));
result.addToplevelCluster(model);
}
+ if(isSoft()) {
+ result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
+ }
+ else {
+ probClusterIGivenX.destroy();
+ }
return result;
}
/**
+ * Compute the inverse cluster matrices.
+ *
+ * @param covarianceMatrices Input covariance matrices
+ * @param invCovMatr Output array for inverse matrices
+ * @param normDistrFactor Output array for norm distribution factors.
+ * @param norm Normalization factor, usually (2pi)^d
+ */
+ public static void computeInverseMatrixes(Matrix[] covarianceMatrices, Matrix[] invCovMatr, double[] normDistrFactor, final double norm) {
+ int k = covarianceMatrices.length;
+ for(int i = 0; i < k; i++) {
+ final double det = covarianceMatrices[i].det();
+ if(det > 0.) {
+ normDistrFactor[i] = 1. / Math.sqrt(norm * det);
+ }
+ else {
+ LOG.warning("Encountered matrix with 0 determinant - degenerated.");
+ normDistrFactor[i] = 1.; // Not really well defined
+ }
+ invCovMatr[i] = covarianceMatrices[i].inverse();
+ }
+ }
+
+ /**
+   * Recompute the covariance matrices.
+   *
+   * @param relation Vector data
+   * @param probClusterIGivenX Object probabilities
+   * @param means Cluster means output
+   * @param covarianceMatrices Output covariance matrices
+ * @param dimensionality Data set dimensionality
+ */
+ public static void recomputeCovarianceMatrices(Relation<? extends NumberVector<?>> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] covarianceMatrices, final int dimensionality) {
+ final int k = means.length;
+ CovarianceMatrix[] cms = new CovarianceMatrix[k];
+ for(int i = 0; i < k; i++) {
+ cms[i] = new CovarianceMatrix(dimensionality);
+ }
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
+ Vector instance = relation.get(iditer).getColumnVector();
+ for(int i = 0; i < k; i++) {
+ if(clusterProbabilities[i] > 0.) {
+ cms[i].put(instance, clusterProbabilities[i]);
+ }
+ }
+ }
+ for(int i = 0; i < k; i++) {
+ if(cms[i].getWeight() <= 0.) {
+ means[i] = new Vector(dimensionality);
+ covarianceMatrices[i] = Matrix.identity(dimensionality, dimensionality);
+ }
+ else {
+ means[i] = cms[i].getMeanVector();
+ covarianceMatrices[i] = cms[i].destroyToNaiveMatrix().cheatToAvoidSingularity(SINGULARITY_CHEAT);
+ }
+ }
+ }
+
+ /**
* Assigns the current probability values to the instances in the database and
* compute the expectation value of the current mixture of distributions.
*
* Computed as the sum of the logarithms of the prior probability of each
* instance.
*
- * @param database the database used for assignment to instances
+ * @param relation the database used for assignment to instances
* @param normDistrFactor normalization factor for density function, based on
* current covariance matrix
* @param means the current means
@@ -337,58 +336,55 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @param clusterWeights the weights of the current clusters
* @return the expectation value of the current mixture of distributions
*/
- protected double assignProbabilitiesToInstances(Relation<V> database, double[] normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
- double emSum = 0.0;
+ public static double assignProbabilitiesToInstances(Relation<? extends NumberVector<?>> relation, double[] normDistrFactor, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
+ final int k = clusterWeights.length;
+ double emSum = 0.;
- for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
- Vector x = database.get(iditer).getColumnVector();
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ Vector x = relation.get(iditer).getColumnVector();
double[] probabilities = new double[k];
- for (int i = 0; i < k; i++) {
- Vector difference = x.minus(means.get(i));
- double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference);
- double power = rowTimesCovTimesCol / 2.0;
+ for(int i = 0; i < k; i++) {
+ Vector difference = x.minus(means[i]);
+ double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr[i], difference);
+ double power = rowTimesCovTimesCol / 2.;
double prob = normDistrFactor[i] * Math.exp(-power);
- if (LOG.isDebuggingFinest()) {
- LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " "));
+ if(LOG.isDebuggingFinest()) {
+ LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + //
+ " difference:\n" + FormatUtil.format(difference, " ") + "\n" + //
+ " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + //
+ " power= " + power + "\n" + " prob=" + prob + "\n" + //
+ " inv cov matrix: \n" + FormatUtil.format(invCovMatr[i], " "));
}
- if (!(prob >= 0.)) {
+ if(!(prob >= 0.)) {
LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor[i]);
+ prob = 0.;
}
probabilities[i] = prob;
}
- double priorProbability = 0.0;
- for (int i = 0; i < k; i++) {
+ double priorProbability = 0.;
+ for(int i = 0; i < k; i++) {
priorProbability += probabilities[i] * clusterWeights[i];
}
double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD);
- if (!Double.isNaN(logP)) {
+ if(!Double.isNaN(logP)) {
emSum += logP;
}
double[] clusterProbabilities = new double[k];
- for (int i = 0; i < k; i++) {
- assert (clusterWeights[i] >= 0.0);
+ for(int i = 0; i < k; i++) {
+ assert (clusterWeights[i] >= 0.);
// do not divide by zero!
- if (priorProbability > 0.0) {
+ if(priorProbability > 0.) {
clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i];
- } else {
- clusterProbabilities[i] = 0.0;
+ }
+ else {
+ clusterProbabilities[i] = 0.;
}
}
probClusterIGivenX.put(iditer, clusterProbabilities);
}
- return emSum;
- }
-
- /**
- * Get the probabilities for a given point.
- *
- * @param index Point ID
- * @return Probabilities of given point
- */
- public double[] getProbClusterIGivenX(DBIDRef index) {
- return probClusterIGivenX.get(index);
+ return emSum / relation.size();
}
@Override
@@ -402,6 +398,20 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
}
/**
+ * @return the soft
+ */
+ public boolean isSoft() {
+ return soft;
+ }
+
+ /**
+ * @param soft the soft to set
+ */
+ public void setSoft(boolean soft) {
+ this.soft = soft;
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
@@ -409,45 +419,77 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the termination criterion for maximization of E(M):
+ * E(M) - E(M') < em.delta, must be a double equal to or greater than 0.
+ */
+ public static final OptionID DELTA_ID = new OptionID("em.delta", //
+ "The termination criterion for maximization of E(M): " + //
+ "E(M) - E(M') < em.delta");
+
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = new OptionID("kmeans.initialization", //
+ "Method to choose the initial means.");
+
+ /**
+ * Number of clusters.
+ */
protected int k;
+ /**
+ * Stopping threshold
+ */
protected double delta;
+ /**
+ * Initialization method
+ */
protected KMeansInitialization<V> initializer;
+ /**
+ * Maximum number of iterations.
+ */
protected int maxiter = -1;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.0);
- deltaP.addConstraint(new GreaterEqualConstraint(0.0));
- if (config.grab(deltaP)) {
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(deltaP)) {
delta = deltaP.getValue();
}
IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
maxiterP.setOptional(true);
- if (config.grab(maxiterP)) {
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.getValue();
}
}
@Override
protected EM<V> makeInstance() {
- return new EM<>(k, delta, initializer, maxiter);
+ return new EM<>(k, delta, initializer, maxiter, false);
}
}
}
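The EM rewrite extracts the M-step into reusable static helpers: recomputeCovarianceMatrices accumulates weighted means and covariances via CovarianceMatrix, and computeInverseMatrixes derives the inverse matrices and normal distribution factors. The per-point accessor getProbClusterIGivenX is gone; soft assignments can instead be retained as a child result, and assignProbabilitiesToInstances now returns the average log-likelihood (emSum / relation.size()), so the em.delta threshold no longer scales with the data set size. A sketch of the new soft mode; the five-argument constructor and the "em-soft-score" result come from the hunks above, while the initializer setup via ClassGenericsUtil is an assumption:

    // Sketch: EM with soft assignments retained. Note the Parameterizer
    // currently instantiates with soft = false; use the constructor to enable.
    KMeansInitialization<DoubleVector> init = ClassGenericsUtil.parameterizeOrAbort(RandomlyGeneratedInitialMeans.class, new ListParameterization());
    EM<DoubleVector> em = new EM<>(3, 0.0, init, -1, true); // k=3, no iteration limit, soft=true
    Clustering<EMModel<DoubleVector>> result = em.run(database, relation);
    // The per-point probability vectors are attached as a MaterializedRelation
    // child result ("cluster assignments", type SOFT_TYPE) instead of destroyed.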
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
index e928d041..a4a922df 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
@@ -33,10 +33,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.DistanceUtil;
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -146,7 +146,8 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
// boxing/unboxing.
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
if(!processedIDs.contains(iditer)) {
- // We need to do some ugly casts to be able to run the optimized version, unfortunately.
+ // We need to do some ugly casts to be able to run the optimized
+ // version, unfortunately.
@SuppressWarnings("unchecked")
final ClusterOrderResult<DoubleDistance> doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder);
@SuppressWarnings("unchecked")
@@ -304,7 +305,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
index 583d402b..db343f3a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
@@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry;
import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
@@ -240,6 +239,10 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm<
// By default, clusters cover both the steep up and steep down area
int cstart = sda.getStartIndex();
int cend = sua.getEndIndex();
+ // Hotfix: never include infinity-reachable points at the end
+ while(cend > cstart && Double.isInfinite(clusterOrder.get(cend).getReachability().doubleValue())) {
+ --cend;
+ }
// However, we sometimes have to adjust this (Condition 4):
{
// Case b)
@@ -654,8 +657,8 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm<
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter xiP = new DoubleParameter(XI_ID);
- xiP.addConstraint(new GreaterEqualConstraint(0.0));
- xiP.addConstraint(new LessConstraint(1.0));
+ xiP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ xiP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
if(config.grab(xiP)) {
xi = xiP.doubleValue();
}
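The OPTICSXi hotfix concerns the tail of a cluster: objects that are not density-reachable at all carry infinite reachability in the cluster order, and a steep area ending at the plot boundary could previously absorb them into the extracted cluster. The new loop shrinks the cluster end past any such entries. The same logic on a toy reachability array (plain Java, not ELKI API):

    // Standalone illustration of the hotfix.
    double[] reach = { 0.2, 0.3, 0.25, Double.POSITIVE_INFINITY };
    int cstart = 0, cend = reach.length - 1;
    while(cend > cstart && Double.isInfinite(reach[cend])) {
      --cend;
    }
    // The cluster now covers indices [cstart, cend] = [0, 2]; the
    // infinity-reachable point at index 3 stays outside.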
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
index 95d9f23c..86bb9a09 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
@@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -328,7 +328,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java
new file mode 100644
index 00000000..68dacf34
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java
@@ -0,0 +1,350 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.iterator.TIntObjectIterator;
+import gnu.trove.map.hash.TIntObjectHashMap;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Cluster analysis by affinity propagation.
+ *
+ * Reference:
+ * <p>
+ * Clustering by Passing Messages Between Data Points<br />
+ * B. J. Frey and D. Dueck<br />
+ * Science Vol 315
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf AffinityPropagationInitialization
+ *
+ * @param <O> object type
+ */
+@Title("Affinity Propagation: Clustering by Passing Messages Between Data Points")
+@Reference(title = "Clustering by Passing Messages Between Data Points", authors = "B. J. Frey and D. Dueck", booktitle = "Science Vol 315", url = "http://dx.doi.org/10.1126/science.1136800")
+public class AffinityPropagationClusteringAlgorithm<O> extends AbstractAlgorithm<Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * Class logger
+ */
+ private static final Logging LOG = Logging.getLogger(AffinityPropagationClusteringAlgorithm.class);
+
+ /**
+ * Similarity initialization
+ */
+ AffinityPropagationInitialization<O> initialization;
+
+ /**
+ * Damping factor lambda.
+ */
+ double lambda = 0.5;
+
+ /**
+ * Terminate after 10 iterations with no changes.
+ */
+ int convergence = 10;
+
+ /**
+ * Maximum number of iterations.
+ */
+ int maxiter = 1000;
+
+ /**
+ * Constructor.
+ *
+ * @param initialization Similarity initialization
+ * @param lambda Damping factor
+ * @param convergence Termination threshold (Number of stable iterations)
+ * @param maxiter Maximum number of iterations
+ */
+ public AffinityPropagationClusteringAlgorithm(AffinityPropagationInitialization<O> initialization, double lambda, int convergence, int maxiter) {
+ super();
+ this.initialization = initialization;
+ this.lambda = lambda;
+ this.convergence = convergence;
+ this.maxiter = maxiter;
+ }
+
+ /**
+ * Perform affinity propagation clustering.
+ *
+ * @param db Database
+ * @param relation Relation
+ * @return Clustering result
+ */
+ public Clustering<MedoidModel> run(Database db, Relation<O> relation) {
+ ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
+ final int size = ids.size();
+
+ int[] assignment = new int[size];
+ double[][] s = initialization.getSimilarityMatrix(db, relation, ids);
+ double[][] r = new double[size][size];
+ double[][] a = new double[size][size];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null;
+ MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null;
+
+ int inactive = 0;
+ for(int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) {
+ // Update responsibility matrix:
+ for(int i = 0; i < size; i++) {
+ double[] ai = a[i], ri = r[i], si = s[i];
+ // Find the two largest values (as initially maxk == i)
+ double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
+ int maxk = -1;
+ for(int k = 0; k < size; k++) {
+ double val = ai[k] + si[k];
+ if(val > max1) {
+ max2 = max1;
+ max1 = val;
+ maxk = k;
+ }
+ else if(val > max2) {
+ max2 = val;
+ }
+ }
+ // With the maximum value known, update r:
+ for(int k = 0; k < size; k++) {
+ double val = si[k] - ((k != maxk) ? max1 : max2);
+ ri[k] = ri[k] * lambda + val * (1. - lambda);
+ }
+ }
+ // Update availability matrix
+ for(int k = 0; k < size; k++) {
+ // Compute sum of max(0, r_ik) for all i.
+ // For r_kk, don't apply the max.
+ double colposum = 0.;
+ for(int i = 0; i < size; i++) {
+ if(i == k || r[i][k] > 0.) {
+ colposum += r[i][k];
+ }
+ }
+ for(int i = 0; i < size; i++) {
+ double val = colposum;
+ // Adjust column sum by the one extra term.
+ if(i == k || r[i][k] > 0.) {
+ val -= r[i][k];
+ }
+ if(i != k && val > 0.) { // min
+ val = 0.;
+ }
+ a[i][k] = a[i][k] * lambda + val * (1 - lambda);
+ }
+ }
+ int changed = 0;
+ for(int i = 0; i < size; i++) {
+ double[] ai = a[i], ri = r[i];
+ double max = Double.NEGATIVE_INFINITY;
+ int maxj = -1;
+ for(int j = 0; j < size; j++) {
+ double v = ai[j] + ri[j];
+ if(v > max || (i == j && v >= max)) {
+ max = v;
+ maxj = j;
+ }
+ }
+ if(assignment[i] != maxj) {
+ changed += 1;
+ assignment[i] = maxj;
+ }
+ }
+ inactive = (changed > 0) ? 0 : (inactive + 1);
+ if(prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ if(aprog != null) {
+ aprog.setProcessed(size - changed, LOG);
+ }
+ }
+ if(aprog != null) {
+ aprog.setProcessed(aprog.getTotal(), LOG);
+ }
+ if(prog != null) {
+ prog.setCompleted(LOG);
+ }
+ // Cluster map, by lead object
+ TIntObjectHashMap<ModifiableDBIDs> map = new TIntObjectHashMap<>();
+ DBIDArrayIter i1 = ids.iter();
+ for(int i = 0; i1.valid(); i1.advance(), i++) {
+ int c = assignment[i];
+ // Add to cluster members:
+ ModifiableDBIDs cids = map.get(c);
+ if(cids == null) {
+ cids = DBIDUtil.newArray();
+ map.put(c, cids);
+ }
+ cids.add(i1);
+ }
+ // If we stopped early, the cluster lead might be in a different cluster.
+ for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) {
+ iter.advance(); // Trove iterator; advance first!
+ final int key = iter.key();
+ int targetkey = key;
+ ModifiableDBIDs tids = null;
+ // Chase arrows:
+      while(tids == null && assignment[targetkey] != targetkey) {
+ targetkey = assignment[targetkey];
+ tids = map.get(targetkey);
+ }
+ if(tids != null && targetkey != key) {
+ tids.addDBIDs(iter.value());
+ iter.remove();
+ }
+ }
+
+ Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering");
+ ModifiableDBIDs noise = DBIDUtil.newArray();
+ for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) {
+ iter.advance(); // Trove iterator; advance first!
+ i1.seek(iter.key());
+ if(iter.value().size() > 1) {
+ MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1));
+ clustering.addToplevelCluster(new Cluster<>(iter.value(), mod));
+ }
+ else {
+ noise.add(i1);
+ }
+ }
+ if(noise.size() > 0) {
+ MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter()));
+ clustering.addToplevelCluster(new Cluster<>(noise, true, mod));
+ }
+ return clustering;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(initialization.getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <O> object type
+ */
+ public static class Parameterizer<O> extends AbstractParameterizer {
+ /**
+ * Parameter for the similarity matrix initialization
+ */
+    public static final OptionID INITIALIZATION_ID = new OptionID("ap.initialization", "Similarity matrix initialization.");
+
+ /**
+     * Parameter for the damping factor.
+     */
+    public static final OptionID LAMBDA_ID = new OptionID("ap.lambda", "Damping factor lambda. Usually 0.5 to 1.");
+
+ /**
+ * Parameter for the convergence factor.
+ */
+ public static final OptionID CONVERGENCE_ID = new OptionID("ap.convergence", "Number of stable iterations for convergence.");
+
+ /**
+     * Parameter for the maximum number of iterations.
+ */
+ public static final OptionID MAXITER_ID = new OptionID("ap.maxiter", "Maximum number of iterations.");
+
+ /**
+ * Initialization function for the similarity matrix.
+ */
+ AffinityPropagationInitialization<O> initialization;
+
+ /**
+ * Dampening parameter.
+ */
+ double lambda = .5;
+
+ /**
+ * Number of stable iterations for convergence.
+ */
+ int convergence;
+
+ /**
+ * Maximum number of iterations.
+ */
+ int maxiter;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final ObjectParameter<AffinityPropagationInitialization<O>> param = new ObjectParameter<>(INITIALIZATION_ID, AffinityPropagationInitialization.class, DistanceBasedInitializationWithMedian.class);
+ if(config.grab(param)) {
+ initialization = param.instantiateClass(config);
+ }
+ final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, .5);
+ lambdaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ lambdaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(lambdaP)) {
+ lambda = lambdaP.doubleValue();
+ }
+ final IntParameter convergenceP = new IntParameter(CONVERGENCE_ID, 15);
+ convergenceP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(convergenceP)) {
+ convergence = convergenceP.intValue();
+ }
+ final IntParameter maxiterP = new IntParameter(MAXITER_ID, 1000);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.intValue();
+ }
+ }
+
+ @Override
+ protected AffinityPropagationClusteringAlgorithm<O> makeInstance() {
+ return new AffinityPropagationClusteringAlgorithm<>(initialization, lambda, convergence, maxiter);
+ }
+ }
+}
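+// Usage sketch (illustrative, not part of this patch): constructing the
+// algorithm directly with the same defaults the Parameterizer above
+// registers; "db" stands for an assumed, already initialized Database.
+//
+//   AffinityPropagationInitialization<DoubleVector> init =
+//       new DistanceBasedInitializationWithMedian<>(
+//           SquaredEuclideanDistanceFunction.STATIC, 0.5);
+//   AffinityPropagationClusteringAlgorithm<DoubleVector> ap =
+//       new AffinityPropagationClusteringAlgorithm<>(init, 0.5, 15, 1000);
+//   Clustering<MedoidModel> result = ap.run(db);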
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java
new file mode 100644
index 00000000..5dbc54de
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java
@@ -0,0 +1,59 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
+
+/**
+ * Initialization methods for affinity propagation.
+ *
+ * @author Erich Schubert
+ */
+public interface AffinityPropagationInitialization<O> extends Parameterizable {
+ /**
+ * Quantile to use for the diagonal entries.
+ */
+ public static final OptionID QUANTILE_ID = new OptionID("ap.quantile", "Quantile to use for diagonal entries.");
+
+ /**
+ * Compute the initial similarity matrix.
+ *
+ * @param db Database
+ * @param relation Data relation
+ * @param ids indexed DBIDs
+ * @return Similarity matrix
+ */
+ double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids);
+
+ /**
+ * Get the data type information for the similarity computations.
+ *
+ * @return Data type
+ */
+ TypeInformation getInputTypeRestriction();
+}
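+// Illustrative sketch (hypothetical, not part of this patch): the diagonal of
+// the matrix carries the "preference" values that steer how many exemplars
+// affinity propagation selects. A wrapper that overrides the quantile-based
+// diagonal of another initialization with a fixed preference could look like:
+//
+//   public class ConstantPreferenceInitialization<O> implements AffinityPropagationInitialization<O> {
+//     private final AffinityPropagationInitialization<O> inner;
+//     private final double preference;
+//
+//     public ConstantPreferenceInitialization(AffinityPropagationInitialization<O> inner, double preference) {
+//       this.inner = inner;
+//       this.preference = preference;
+//     }
+//
+//     @Override
+//     public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) {
+//       double[][] mat = inner.getSimilarityMatrix(db, relation, ids);
+//       for(int i = 0; i < mat.length; i++) {
+//         mat[i][i] = preference; // replace the quantile-based diagonal
+//       }
+//       return mat;
+//     }
+//
+//     @Override
+//     public TypeInformation getInputTypeRestriction() {
+//       return inner.getInputTypeRestriction();
+//     }
+//   }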
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java
new file mode 100644
index 00000000..2c8cabf9
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java
@@ -0,0 +1,148 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Distance based initialization.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+public class DistanceBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> {
+ /**
+ * Distance function.
+ */
+ DistanceFunction<? super O, D> distance;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ /**
+ * Constructor.
+ *
+   * @param distance Distance function
+ * @param quantile Quantile
+ */
+ public DistanceBasedInitializationWithMedian(DistanceFunction<? super O, D> distance, double quantile) {
+ super();
+ this.distance = distance;
+ this.quantile = quantile;
+ }
+
+ @Override
+ public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) {
+ final int size = ids.size();
+ DistanceQuery<O, D> dq = db.getDistanceQuery(relation, distance);
+ double[][] mat = new double[size][size];
+ double[] flat = new double[(size * (size - 1)) >> 1];
+ // TODO: optimize for double valued primitive distances.
+ DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
+ for (int i = 0, j = 0; i < size; i++, i1.advance()) {
+ double[] mati = mat[i];
+ i2.seek(i + 1);
+ for (int k = i + 1; k < size; k++, i2.advance()) {
+ mati[k] = -dq.distance(i1, i2).doubleValue();
+ mat[k][i] = mati[k]; // symmetry.
+ flat[j] = mati[k];
+ j++;
+ }
+ }
+ double median = QuickSelect.quantile(flat, quantile);
+ // On the diagonal, we place the median
+ for (int i = 0; i < size; i++) {
+ mat[i][i] = median;
+ }
+ return mat;
+ }
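+  // Worked example (illustrative): for three 1-d points 0, 1 and 3 under
+  // squared Euclidean distance, the off-diagonal similarities are
+  // s(0,1) = -1, s(0,3) = -9 and s(1,3) = -4; with quantile 0.5, the
+  // median -4 of {-9, -4, -1} is placed on the diagonal:
+  //
+  //   [ -4  -1  -9 ]
+  //   [ -1  -4  -4 ]
+  //   [ -9  -4  -4 ]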
+
+ @Override
+ public TypeInformation getInputTypeRestriction() {
+ return distance.getInputTypeRestriction();
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ /**
+ * Parameter for the distance function.
+ */
+ public static final OptionID DISTANCE_ID = new OptionID("ap.distance", "Distance function to use.");
+
+ /**
+     * Distance function.
+ */
+ DistanceFunction<? super O, D> distance;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<DistanceFunction<? super O, D>> param = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class, SquaredEuclideanDistanceFunction.class);
+ if (config.grab(param)) {
+ distance = param.instantiateClass(config);
+ }
+
+ DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5);
+ if (config.grab(quantileP)) {
+ quantile = quantileP.doubleValue();
+ }
+ }
+
+ @Override
+ protected DistanceBasedInitializationWithMedian<O, D> makeInstance() {
+ return new DistanceBasedInitializationWithMedian<>(distance, quantile);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java
new file mode 100644
index 00000000..a138da96
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java
@@ -0,0 +1,153 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.LinearKernelFunction;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Similarity based initialization.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+public class SimilarityBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> {
+ /**
+ * Similarity function.
+ */
+ SimilarityFunction<? super O, D> similarity;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ /**
+ * Constructor.
+ *
+ * @param similarity Similarity function
+ * @param quantile Quantile
+ */
+ public SimilarityBasedInitializationWithMedian(SimilarityFunction<? super O, D> similarity, double quantile) {
+ super();
+ this.similarity = similarity;
+ this.quantile = quantile;
+ }
+
+ @Override
+ public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) {
+ final int size = ids.size();
+ SimilarityQuery<O, D> sq = db.getSimilarityQuery(relation, similarity);
+ double[][] mat = new double[size][size];
+ double[] flat = new double[(size * (size - 1)) >> 1];
+    // TODO: optimize for double valued primitive similarities.
+ DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
+ // Compute self-similarities first, for centering:
+ for (int i = 0; i < size; i++, i1.advance()) {
+ mat[i][i] = sq.similarity(i1, i1).doubleValue() * .5;
+ }
+ i1.seek(0);
+ for (int i = 0, j = 0; i < size; i++, i1.advance()) {
+ final double[] mati = mat[i]; // Probably faster access.
+ i2.seek(i + 1);
+ for (int k = i + 1; k < size; k++, i2.advance()) {
+ mati[k] = sq.similarity(i1, i2).doubleValue() - mati[i] - mat[k][k];
+ mat[k][i] = mati[k]; // symmetry.
+ flat[j] = mati[k];
+ j++;
+ }
+ }
+ double median = QuickSelect.quantile(flat, quantile);
+ // On the diagonal, we place the median
+ for (int i = 0; i < size; i++) {
+ mat[i][i] = median;
+ }
+ return mat;
+ }
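+  // Note on the centering above (derivation): since mat[i][i] temporarily
+  // holds s(i,i) / 2, each off-diagonal entry equals
+  //   s(i,k) - (s(i,i) + s(k,k)) / 2 = -0.5 * (s(i,i) - 2 s(i,k) + s(k,k)),
+  // i.e. minus half the squared kernel-induced distance, so larger values
+  // still mean "more similar" before the median replaces the diagonal.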
+
+ @Override
+ public TypeInformation getInputTypeRestriction() {
+ return similarity.getInputTypeRestriction();
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ /**
+ * Parameter for the similarity function.
+ */
+ public static final OptionID SIMILARITY_ID = new OptionID("ap.similarity", "Similarity function to use.");
+
+ /**
+ * Similarity function.
+ */
+ SimilarityFunction<? super O, D> similarity;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<SimilarityFunction<? super O, D>> param = new ObjectParameter<>(SIMILARITY_ID, SimilarityFunction.class, LinearKernelFunction.class);
+ if (config.grab(param)) {
+ similarity = param.instantiateClass(config);
+ }
+
+ DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5);
+ if (config.grab(quantileP)) {
+ quantile = quantileP.doubleValue();
+ }
+ }
+
+ @Override
+ protected SimilarityBasedInitializationWithMedian<O, D> makeInstance() {
+ return new SimilarityBasedInitializationWithMedian<>(similarity, quantile);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java
new file mode 100644
index 00000000..bc6059ac
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Affinity Propagation (AP) clustering.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java
new file mode 100644
index 00000000..8b875340
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java
@@ -0,0 +1,302 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.BiclusterModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+
+/**
+ * Abstract class as a convenience for different biclustering approaches.
+ * <p/>
+ * The typically required values describing submatrices are computed using the
+ * corresponding values within a database of NumberVectors.
+ * <p/>
+ * The database is supposed to present a data matrix with a row representing an
+ * entry ({@link NumberVector}), a column representing a dimension (attribute)
+ * of the {@link NumberVector}s.
+ *
+ * @author Arthur Zimek
+ * @param <V> a certain subtype of NumberVector - the data matrix is supposed to
+ * consist of rows where each row relates to an object of type V and the
+ * columns relate to the attribute values of these objects
+ * @param <M> Cluster model type
+ */
+public abstract class AbstractBiclustering<V extends NumberVector<?>, M extends BiclusterModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> {
+ /**
+ * Keeps the currently set database.
+ */
+ private Database database;
+
+ /**
+ * Relation we use.
+ */
+ protected Relation<V> relation;
+
+ /**
+ * Iterator to use for more efficient random access.
+ */
+ private DBIDArrayIter iter;
+
+ /**
+ * The row ids corresponding to the currently set {@link #relation}.
+ */
+ protected ArrayDBIDs rowIDs;
+
+ /**
+ * Column dimensionality.
+ */
+ private int colDim;
+
+ /**
+ * Constructor.
+ */
+ protected AbstractBiclustering() {
+ super();
+ }
+
+ /**
+ * Prepares the algorithm for running on a specific database.
+ * <p/>
+ * Assigns the database, the row ids, and the col ids, then calls
+ * {@link #biclustering()}.
+ * <p/>
+ * Any concrete algorithm should be implemented within method
+ * {@link #biclustering()} by an inheriting biclustering approach.
+ *
+ * @param relation Relation to process
+ * @return Clustering result
+ */
+ public final Clustering<M> run(Relation<V> relation) {
+ this.relation = relation;
+ if (this.relation == null || this.relation.size() == 0) {
+ throw new IllegalArgumentException(ExceptionMessages.DATABASE_EMPTY);
+ }
+    this.database = relation.getDatabase();
+    colDim = RelationUtil.dimensionality(relation);
+ rowIDs = DBIDUtil.ensureArray(this.relation.getDBIDs());
+ iter = rowIDs.iter();
+ return biclustering();
+ }
+
+ /**
+ * Run the actual biclustering algorithm.
+ * <p/>
+   * This method is supposed to be called only from the method
+   * {@link #run}.
+   *
+   * @return Clustering result
+ */
+ protected abstract Clustering<M> biclustering();
+
+ /**
+ * Convert a bitset into integer column ids.
+ *
+   * @param cols Bitset of selected columns
+ * @return integer column ids
+ */
+ protected int[] colsBitsetToIDs(BitSet cols) {
+ int[] colIDs = new int[cols.cardinality()];
+ int colsIndex = 0;
+ for (int i = cols.nextSetBit(0); i >= 0; i = cols.nextSetBit(i + 1)) {
+ colIDs[colsIndex] = i;
+ colsIndex++;
+ }
+ return colIDs;
+ }
+
+ /**
+ * Convert a bitset into integer row ids.
+ *
+   * @param rows Bitset of selected rows
+ * @return integer row ids
+ */
+ protected ArrayDBIDs rowsBitsetToIDs(BitSet rows) {
+ ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray(rows.cardinality());
+ DBIDArrayIter iter = this.rowIDs.iter();
+ for (int i = rows.nextSetBit(0); i >= 0; i = rows.nextSetBit(i + 1)) {
+ iter.seek(i);
+ rowIDs.add(iter);
+ }
+ return rowIDs;
+ }
+
+ /**
+ * Defines a Bicluster as given by the included rows and columns.
+ *
+ * @param rows the rows included in the Bicluster
+ * @param cols the columns included in the Bicluster
+ * @return a Bicluster as given by the included rows and columns
+ */
+ protected Cluster<BiclusterModel> defineBicluster(BitSet rows, BitSet cols) {
+ ArrayDBIDs rowIDs = rowsBitsetToIDs(rows);
+ int[] colIDs = colsBitsetToIDs(cols);
+ return new Cluster<>(rowIDs, new BiclusterModel(colIDs));
+ }
+
+ /**
+ * Defines a Bicluster as given by the included rows and columns.
+ *
+ * @param rows the rows included in the Bicluster
+ * @param cols the columns included in the Bicluster
+ * @return A Bicluster as given by the included rows and columns
+ */
+ protected Cluster<BiclusterModel> defineBicluster(long[] rows, long[] cols) {
+ ArrayDBIDs rowIDs = rowsBitsetToIDs(rows);
+ int[] colIDs = colsBitsetToIDs(cols);
+ return new Cluster<>(rowIDs, new BiclusterModel(colIDs));
+ }
+
+ /**
+ * Returns the value of the data matrix at row <code>row</code> and column
+ * <code>col</code>.
+ *
+ * @param row the row in the data matrix according to the current order of
+ * rows (refers to database entry
+ * <code>database.get(rowIDs[row])</code>)
+   * @param col the column in the data matrix according to the current order of
+   *        columns (refers to the attribute value of a database entry
+ * <code>getValue(colIDs[col])</code>)
+ * @return the attribute value of the database entry as retrieved by
+ * <code>database.get(rowIDs[row]).getValue(colIDs[col])</code>
+ */
+ protected double valueAt(int row, int col) {
+ iter.seek(row);
+ return relation.get(iter).doubleValue(col);
+ }
+
+ /**
+ * Get the DBID of a certain row
+ *
+ * @param row Row number
+ * @return DBID of this row
+ * @deprecated Expensive!
+ */
+ @Deprecated
+ protected DBID getRowDBID(int row) {
+ return rowIDs.get(row);
+ }
+
+ /**
+ * Convert a bitset into integer column ids.
+ *
+   * @param cols Bitset (in long[] form) of selected columns
+ * @return integer column ids
+ */
+ protected int[] colsBitsetToIDs(long[] cols) {
+ int[] colIDs = new int[(int) BitsUtil.cardinality(cols)];
+ int colsIndex = 0;
+ for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ if (clong == 0L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for (int j = 0; j < Long.SIZE; ++j, ++cpos, clong >>>= 1) {
+ if ((clong & 1L) == 1L) {
+ colIDs[colsIndex] = cpos;
+ ++colsIndex;
+ }
+ }
+ }
+ return colIDs;
+ }
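+  // Example (illustrative): for cols = { 0b10110L }, the set bit positions
+  // are 1, 2 and 4, so this returns {1, 2, 4}. rowsBitsetToIDs below maps
+  // the same kind of bit positions to DBIDs by walking the row iterator.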
+
+ /**
+ * Convert a bitset into integer row ids.
+ *
+   * @param rows Bitset (in long[] form) of selected rows
+ * @return integer row ids
+ */
+ protected ArrayDBIDs rowsBitsetToIDs(long[] rows) {
+ ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray((int) BitsUtil.cardinality(rows));
+ DBIDArrayIter iter = this.rowIDs.iter();
+ outer: for (int rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if (rlong == 0L) {
+ iter.advance(Long.SIZE);
+ continue;
+ }
+ for (int i = 0; i < Long.SIZE; ++i, rlong >>>= 1, iter.advance()) {
+ if (!iter.valid()) {
+ break outer;
+ }
+ if ((rlong & 1L) == 1L) {
+ rowIDs.add(iter);
+ }
+ }
+ }
+ return rowIDs;
+ }
+
+ /**
+ * Provides the number of rows of the data matrix.
+ *
+ * @return the number of rows of the data matrix
+ */
+ protected int getRowDim() {
+ return this.rowIDs.size();
+ }
+
+ /**
+ * Provides the number of columns of the data matrix.
+ *
+ * @return the number of columns of the data matrix
+ */
+ protected int getColDim() {
+ return colDim;
+ }
+
+ /**
+ * Getter for database.
+ *
+ * @return database
+ */
+ public Database getDatabase() {
+ return database;
+ }
+
+ /**
+ * Getter for the relation.
+ *
+ * @return relation
+ */
+ public Relation<V> getRelation() {
+ return relation;
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
new file mode 100644
index 00000000..e110faff
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
@@ -0,0 +1,900 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Perform Cheng and Church biclustering.
+ *
+ * <p>
+ * Reference: <br>
+ * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of
+ * the 8th International Conference on Intelligent Systems for Molecular Biology
+ * (ISMB), San Diego, CA, 2000.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @param <V> Vector type.
+ */
+@Reference(authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)")
+public class ChengAndChurch<V extends NumberVector<?>> extends AbstractBiclustering<V, BiclusterWithInversionsModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(ChengAndChurch.class);
+
+ /**
+ * The minimum number of columns that the database must have so that a removal
+   * of columns is performed in {@link #multipleNodeDeletion}.
+ * <p>
+ * Just start deleting multiple columns when more than 100 columns are in the
+ * data matrix.
+ * </p>
+ */
+ private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100;
+
+ /**
+ * The minimum number of rows that the database must have so that a removal of
+ * rows is performed in {@link #multipleNodeDeletion}.
+ * <p>
+ * Just start deleting multiple rows when more than 100 rows are in the data
+ * matrix.
+ * </p>
+ * <!--
+ * <p>
+ * The value is set to 100 as this is not really described in the paper.
+ * </p>
+ * -->
+ */
+ private static final int MIN_ROW_REMOVE_THRESHOLD = 100;
+
+ /**
+ * Threshold for the score.
+ */
+ private double delta;
+
+ /**
+   * The parameter for multiple node deletion.
+ * <p>
+ * It is used to magnify the {@link #delta} value in the
+ * {@link #multipleNodeDeletion} method.
+ * </p>
+ */
+ private double alpha;
+
+ /**
+ * Number of biclusters to be found.
+ */
+ private int n;
+
+ /**
+ * Allow inversion of rows in the last phase.
+ */
+ private boolean useinverted = true;
+
+ /**
+ * Distribution to sample random replacement values from.
+ */
+ private Distribution dist;
+
+ /**
+ * Constructor.
+ *
+ * @param delta Delta parameter: desired quality
+ * @param alpha Alpha parameter: controls switching to single node deletion
+ * approach
+ * @param n Number of clusters to detect
+ * @param dist Distribution of random values to insert
+ */
+ public ChengAndChurch(double delta, double alpha, int n, Distribution dist) {
+ super();
+ this.delta = delta;
+ this.alpha = alpha;
+ this.n = n;
+ this.dist = dist;
+ }
+
+ /**
+ * Visitor pattern for processing cells.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static interface CellVisitor {
+ /** Different modes of operation. */
+ int ALL = 0, SELECTED = 1, NOT_SELECTED = 2;
+
+ /**
+ * Visit a cell.
+ *
+ * @param val Value
+ * @param row Row Number
+ * @param col Column number
+ * @param selrow Boolean, whether row is selected
+ * @param selcol Boolean, whether column is selected
+ * @return Stop flag, return {@code true} to stop visiting
+ */
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol);
+ }
+
+ /**
+ * Bicluster candidate.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ protected static class BiclusterCandidate {
+ /**
+ * Cardinalities.
+ */
+ int rowcard, colcard;
+
+ /**
+ * Means.
+ */
+ double[] rowM, colM;
+
+ /**
+ * Row and column bitmasks.
+ */
+ long[] rows, irow, cols;
+
+ /**
+ * Mean of the current bicluster.
+ */
+ double allM;
+
+ /**
+ * The current bicluster score (mean squared residue).
+ */
+ double residue;
+
+ /**
+ * Constructor.
+ *
+ * @param rows Row dimensionality.
+ * @param cols Column dimensionality.
+ */
+ protected BiclusterCandidate(int rows, int cols) {
+ super();
+ this.rows = BitsUtil.ones(rows);
+ this.irow = BitsUtil.zero(rows);
+ this.rowcard = rows;
+ this.rowM = new double[rows];
+ this.cols = BitsUtil.ones(cols);
+ this.colcard = cols;
+ this.colM = new double[cols];
+ }
+
+ /**
+ * Resets the values for the next cluster search.
+ */
+ protected void reset() {
+ rows = BitsUtil.ones(rowM.length);
+ rowcard = rowM.length;
+ cols = BitsUtil.ones(colM.length);
+ colcard = colM.length;
+ BitsUtil.zeroI(irow);
+ }
+
+ /**
+ * Visit all selected cells in the data matrix.
+ *
+ * @param mat Data matrix
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitAll(double[][] mat, int mode, CellVisitor visitor) {
+ // For efficiency, we manually iterate over the rows and column bitmasks.
+ // This saves repeated shifting needed by the manual bit access.
+ for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
+ boolean rselected = ((rlong & 1L) == 1L);
+ if((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) {
+ continue;
+ }
+ for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ if((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
+ boolean cselected = ((clong & 1L) == 1L);
+ if((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) {
+ continue;
+ }
+ boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Visit a column of the matrix.
+ *
+ * @param mat Data matrix
+ * @param col Column to visit
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) {
+ boolean cselected = BitsUtil.get(cols, col);
+ // For efficiency, we manually iterate over the rows and column bitmasks.
+ // This saves repeated shifting needed by the manual bit access.
+ for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if(mode == CellVisitor.SELECTED && rlong == 0L) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && rlong == -1L) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
+ boolean rselected = ((rlong & 1L) == 1L);
+ if(mode == CellVisitor.SELECTED && !rselected) {
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && rselected) {
+ continue;
+ }
+ boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+
+ /**
+ * Visit a row of the data matrix.
+ *
+ * @param mat Data matrix
+ * @param row Row to visit
+ * @param visitor Visitor function
+ */
+ protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) {
+ boolean rselected = BitsUtil.get(rows, row);
+ final double[] rowdata = mat[row];
+ for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ // Fast skip blocks of 64 masked values.
+ if(mode == CellVisitor.SELECTED && clong == 0L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && clong == -1L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
+ boolean cselected = ((clong & 1L) == 1L);
+ if(mode == CellVisitor.SELECTED && !cselected) {
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && cselected) {
+ continue;
+ }
+ boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+
+ /** Visitor for updating the means. */
+ private final CellVisitor MEANVISITOR = new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ if(selcol) {
+ rowM[row] += val;
+ }
+ if(selrow) {
+ colM[col] += val;
+ }
+ if(selcol && selrow) {
+ allM += val;
+ }
+ return false;
+ }
+ };
+
+ /**
+ * Update the row means and column means.
+ *
+ * @param mat Data matrix
+ * @param all Flag, to update all
+ * @return overall mean
+ */
+ protected double updateRowAndColumnMeans(final double[][] mat, boolean all) {
+ final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED;
+ Arrays.fill(rowM, 0.);
+ Arrays.fill(colM, 0.);
+ allM = 0.;
+ visitAll(mat, mode, MEANVISITOR);
+ visitColumn(mat, 0, mode, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ rowM[row] /= colcard;
+ return false;
+ }
+ });
+ visitRow(mat, 0, mode, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ colM[col] /= rowcard;
+ return false;
+ }
+ });
+ allM /= colcard * rowcard;
+ return allM;
+ }
+
+ /**
+ * Compute the mean square residue.
+ *
+ * @param mat Data matrix
+ * @return mean squared residue
+ */
+ protected double computeMeanSquaredDeviation(final double[][] mat) {
+ final Mean msr = new Mean();
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow && selcol);
+ double v = val - rowM[row] - colM[col] + allM;
+ msr.put(v * v);
+ return false;
+ }
+ });
+ residue = msr.getMean();
+ return residue;
+ }
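+    // In the notation of Cheng and Church, this is the mean squared residue
+    //   H(I,J) = 1/(|I||J|) * sum_{i in I, j in J} (a_ij - a_iJ - a_Ij + a_IJ)^2,
+    // with a_iJ = rowM[i], a_Ij = colM[j] and a_IJ = allM.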
+
+ /**
+ * Computes the <b>mean row residue</b> of the given <code>row</code>.
+ *
+ * @param mat Data matrix
+     * @param row The row whose residue should be computed.
+ * @param rowinverted Indicates if the row should be considered inverted.
+ * @return The row residue of the given <code>row</code>.
+ */
+ protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) {
+ final Mean rowResidue = new Mean();
+ visitRow(mat, row, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ final double rowMean = rowM[row];
+ final double colMean = colM[col];
+ double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM;
+ rowResidue.put(v * v);
+ return false;
+ }
+ });
+ return rowResidue.getMean();
+ }
+
+    /**
+     * Computes the <b>mean column residue</b> of the given <code>col</code>.
+     *
+     * @param mat Data matrix
+     * @param col The column whose residue should be computed.
+     * @return The column residue of the given <code>col</code>umn.
+     */
+ protected double computeColResidue(final double[][] mat, final int col) {
+ final double bias = colM[col] - allM;
+ final Mean colResidue = new Mean();
+ visitColumn(mat, col, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ final double rowMean = rowM[row];
+ double v = val - rowMean - bias;
+ colResidue.put(v * v);
+ return false;
+ }
+ });
+ return colResidue.getMean();
+ }
+
+ /**
+     * Mask the currently selected bicluster: replace its values in the data
+     * matrix with random values, so the cluster is not found again.
+     *
+     * @param mat Data matrix to update.
+ * @param replacement Distribution to sample replacement values from.
+ */
+ protected void maskMatrix(final double[][] mat, final Distribution replacement) {
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow && selcol);
+ mat[row][col] = replacement.nextRandom();
+ return false;
+ }
+ });
+ }
+
+ /**
+ * Select or deselect a column.
+ *
+ * @param cnum Column to select
+ * @param set Value to set
+ */
+ protected void selectColumn(int cnum, boolean set) {
+ if(set) {
+ BitsUtil.setI(cols, cnum);
+ colcard++;
+ }
+ else {
+ BitsUtil.clearI(cols, cnum);
+ colcard--;
+ }
+ }
+
+ /**
+ * Select or deselect a row.
+ *
+ * @param rnum Row to select
+ * @param set Value to set
+ */
+ protected void selectRow(int rnum, boolean set) {
+ if(set) {
+ BitsUtil.setI(rows, rnum);
+ rowcard++;
+ }
+ else {
+ BitsUtil.clearI(rows, rnum);
+ rowcard--;
+ }
+ }
+
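+    /**
+     * Mark a row as inverted, or clear its inversion flag.
+     *
+     * @param rnum Row number
+     * @param set Flag to set or clear
+     */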
+    protected void invertRow(int rnum, boolean set) {
+      if(set) {
+        BitsUtil.setI(irow, rnum);
+      }
+      else {
+        BitsUtil.clearI(irow, rnum);
+      }
+ }
+ }
+
+ @Override
+ public Clustering<BiclusterWithInversionsModel> biclustering() {
+ double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);
+
+ BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());
+
+ Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
+ ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());
+
+    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", n, LOG) : null;
+ for(int i = 0; i < n; i++) {
+ cand.reset();
+ multipleNodeDeletion(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ singleNodeDeletion(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ nodeAddition(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ cand.maskMatrix(mat, dist);
+ BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
+ final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
+ noise.removeDBIDs(cids);
+ result.addToplevelCluster(new Cluster<>(cids, model));
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
+ LOG.verbose("Number of rows: " + cand.rowcard + "\n");
+ LOG.verbose("Number of columns: " + cand.colcard + "\n");
+ // LOG.verbose("Total number of masked values: " + maskedVals.size() +
+ // "\n");
+ }
+ if(prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ }
+ // Add a noise cluster, full-dimensional.
+ if(!noise.isEmpty()) {
+ long[] allcols = BitsUtil.ones(getColDim());
+ BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
+ result.addToplevelCluster(new Cluster<>(noise, true, model));
+ }
+ if(prog != null) {
+ prog.ensureCompleted(LOG);
+ }
+ return result;
+ }
+
+ /**
+ * Algorithm 1 of Cheng and Church:
+ *
+   * Greedily remove the single row or column that contributes most to the
+   * residue, one at a time.
+ *
+ * Inverted rows are not supported in this method.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
+ // Assume that cand.residue is up to date!
+ while(cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) {
+ // Store current maximum. Need final mutable, so use arrays.
+ final double[] max = { Double.NEGATIVE_INFINITY };
+ final int[] best = { -1, -1 };
+
+ // Test rows
+ if(cand.rowcard > 2) {
+ cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ double rowResidue = cand.computeRowResidue(mat, row, false);
+ if(max[0] < rowResidue) {
+ max[0] = rowResidue;
+ best[0] = row;
+ }
+ return false;
+ }
+ });
+ }
+
+ // Test columns:
+ if(cand.colcard > 2) {
+ cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ double colResidue = cand.computeColResidue(mat, col);
+ if(max[0] < colResidue) {
+ max[0] = colResidue;
+ best[1] = col;
+ }
+ return false;
+ }
+ });
+ }
+
+ if(best[1] >= 0) { // then override bestrow!
+ cand.selectColumn(best[1], false);
+ }
+ else {
+ assert (best[0] >= 0);
+ cand.selectRow(best[0], false);
+ }
+ // TODO: incremental update could be much faster?
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ }
+ }
+
+ /**
+ * Algorithm 2 of Cheng and Church.
+ *
+   * Remove all rows and columns whose residue exceeds alpha times the
+   * current mean squared residue.
+ *
+ * Inverted rows are not supported in this method.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+
+ // Note: assumes that cand.residue = H(I,J)
+ while(cand.residue > delta) {
+ final boolean[] modified = { false, false };
+
+ // Step 2: remove rows above threshold
+ if(cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
+ final double alphaResidue = alpha * cand.residue;
+ cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ if(cand.computeRowResidue(mat, row, false) > alphaResidue) {
+ cand.selectRow(row, false);
+ modified[0] = true;
+ }
+ return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
+ }
+ });
+
+ // Step 3: update residue
+ if(modified[0]) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+ }
+
+ // Step 4: remove columns above threshold
+ if(cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
+ final double alphaResidue = alpha * cand.residue;
+ cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ if(cand.computeColResidue(mat, col) > alphaResidue) {
+ cand.selectColumn(col, false);
+ modified[1] = true;
+ }
+ return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
+ }
+ });
+ if(modified[1]) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+ }
+
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ // Step 5: if nothing has been removed, try removing single nodes.
+ if(!modified[0] && !modified[1]) {
+ break;
+ // Will be executed next in main loop, as per algorithm 4.
+ // singleNodeDeletion();
+ }
+ }
+ }
+
+ /**
+ * Algorithm 3 of Cheng and Church.
+ *
+ * Try to re-add rows or columns that decrease the overall score.
+ *
+ * Also try adding inverted rows.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ while(true) {
+ // We need this to be final + mutable
+ final boolean[] added = new boolean[] { false, false };
+
+ // Step 2: add columns
+ cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selcol);
+ if(cand.computeColResidue(mat, col) <= cand.residue) {
+ cand.selectColumn(col, true);
+ added[0] = true;
+ }
+ return false;
+ }
+ });
+
+ // Step 3: recompute values
+ if(added[0]) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+
+ // Step 4: try adding rows.
+ cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selrow);
+ if(cand.computeRowResidue(mat, row, false) <= cand.residue) {
+ cand.selectRow(row, true);
+ added[1] = true;
+ }
+ return false;
+ }
+ });
+
+ // Step 5: try adding inverted rows.
+ if(useinverted) {
+ cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selrow);
+ if(cand.computeRowResidue(mat, row, true) <= cand.residue) {
+ cand.selectRow(row, true);
+ cand.invertRow(row, true);
+ added[1] = true;
+ }
+ return false;
+ }
+ });
+ }
+ if(added[1]) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ }
+ if(!added[0] && !added[1]) {
+ break;
+ }
+ }
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the distribution of replacement values when masking
+ * a cluster.
+ */
+ public static final OptionID DIST_ID = new OptionID("chengandchurch.replacement", "Distribution of replacement values when masking found clusters.");
+
+ /**
+ * Threshold value to determine the maximal acceptable score (mean squared
+ * residue) of a bicluster.
+ * <p/>
+ * Key: {@code -chengandchurch.delta}
+ * </p>
+ */
+ public static final OptionID DELTA_ID = new OptionID("chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster.");
+
+ /**
+ * Parameter for multiple node deletion to accelerate the algorithm. (&gt;=
+ * 1)
+ * <p/>
+ * Key: {@code -chengandchurch.alpha}
+ * </p>
+ */
+ public static final OptionID ALPHA_ID = new OptionID("chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm.");
+
+ /**
+ * Number of biclusters to be found.
+ * <p/>
+ * Default value: 1
+ * </p>
+ * <p/>
+ * Key: {@code -chengandchurch.n}
+ * </p>
+ */
+ public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found.");
+
+ /**
+ * Threshold for the score ({@link #DELTA_ID}).
+ */
+ private double delta;
+
+ /**
+     * The parameter for multiple node deletion.
+ * <p>
+ * It is used to magnify the {@link #delta} value in the
+ * {@link ChengAndChurch#multipleNodeDeletion} method.
+ * </p>
+ */
+ private double alpha;
+
+ /**
+ * Number of biclusters to be found.
+ */
+ private int n;
+
+ /**
+ * Distribution of replacement values.
+ */
+ private Distribution dist;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+      DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
+      deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+      if(config.grab(deltaP)) {
+        delta = deltaP.doubleValue();
+      }
+
+ IntParameter nP = new IntParameter(N_ID, 1);
+ nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(nP)) {
+ n = nP.intValue();
+ }
+
+ DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.);
+ alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE);
+ if(config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+
+ ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class);
+ if(config.grab(distP)) {
+ dist = distP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected ChengAndChurch<V> makeInstance() {
+ return new ChengAndChurch<>(delta, alpha, n, dist);
+ }
+ }
+}
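+// Usage sketch (illustrative, not part of this patch): constructing the
+// algorithm directly; "db" stands for an assumed, already initialized
+// Database of number vectors, and the uniform replacement range is chosen
+// purely as an example.
+//
+//   ChengAndChurch<DoubleVector> cc = new ChengAndChurch<>(
+//       0.5,  // delta: maximal acceptable mean squared residue
+//       1.5,  // alpha: scaling of the residue for multiple node deletion
+//       10,   // n: number of biclusters to extract
+//       new UniformDistribution(-1., 1.));
+//   Clustering<BiclusterWithInversionsModel> result = cc.run(db);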
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java
new file mode 100644
index 00000000..21363bfc
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * <p>Biclustering algorithms.</p>
+ */
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2013
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
index 0d82add9..8e5fa627 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
@@ -74,7 +74,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
@@ -838,22 +838,22 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(minptsP)) {
minpts = minptsP.getValue();
}
IntParameter maxlevelP = new IntParameter(MAXLEVEL_ID);
- maxlevelP.addConstraint(new GreaterConstraint(0));
+ maxlevelP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(maxlevelP)) {
maxlevel = maxlevelP.getValue();
}
IntParameter mindimP = new IntParameter(MINDIM_ID, 1);
- mindimP.addConstraint(new GreaterConstraint(0));
+ mindimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(mindimP)) {
mindim = mindimP.getValue();
}
DoubleParameter jitterP = new DoubleParameter(JITTER_ID);
- jitterP.addConstraint(new GreaterConstraint(0));
+ jitterP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
if (config.grab(jitterP)) {
jitter = jitterP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
index 9a4b8512..68878aef 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
@@ -29,7 +29,7 @@ import java.util.Map;
import java.util.Map.Entry;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
@@ -270,7 +270,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs
public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) {
ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters);
ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query);
- reconfig.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist);
+ reconfig.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist);
ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm);
reconfig.failOnErrors();
return instance;
@@ -335,7 +335,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs
ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class);
if(config.grab(algP)) {
ListParameterization predefined = new ListParameterization();
- predefined.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI);
+ predefined.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI);
TrackParameters trackpar = new TrackParameters(config);
ChainedParameterization chain = new ChainedParameterization(predefined, trackpar);
chain.errorsTo(config);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
index d1b714bf..79ddc16e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
@@ -36,9 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -162,33 +160,34 @@ public class HiCO<V extends NumberVector<?>> extends OPTICS<V, PCACorrelationDis
super.makeOptions(config);
IntParameter muP = new IntParameter(MU_ID);
- muP.addConstraint(new GreaterConstraint(0));
- if (config.grab(muP)) {
+ muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(muP)) {
mu = muP.getValue();
}
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
kP.setOptional(true);
final int k;
- if (config.grab(kP)) {
+ if(config.grab(kP)) {
k = kP.getValue();
- } else {
+ }
+ else {
k = mu;
}
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, DEFAULT_DELTA);
- deltaP.addConstraint(new GreaterEqualConstraint(0));
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
double delta = DEFAULT_DELTA;
- if (config.grab(deltaP)) {
+ if(config.grab(deltaP)) {
delta = deltaP.doubleValue();
}
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0));
- alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = DEFAULT_ALPHA;
- if (config.grab(alphaP)) {
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
index f9531be0..99144b42 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -116,7 +116,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
* Number of sampling rounds to find a good split
*/
private final int samplingLevel;
-
+
/**
* Random factory
*/
@@ -163,34 +163,34 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
- Random r = rnd.getRandom();
+ Random r = rnd.getSingleThreadedRandom();
final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
int cnum = 0;
- while (unclustered.size() > minsize) {
+ while(unclustered.size() > minsize) {
DBIDs current = unclustered;
int lmDim = 1;
- for (int k = 1; k <= maxdim; k++) {
+ for(int k = 1; k <= maxdim; k++) {
// Implementation note: this while loop is from the original publication
// and the published LMCLUS source code. It doesn't make sense to me -
// it is lacking a stop criterion other than "cluster is too small" and
// "cluster is inseparable"! Additionally, there is good criterion for
// stopping at the appropriate dimensionality either.
- while (true) {
+ while(true) {
Separation separation = findSeparation(relation, current, k, r);
// logger.verbose("k: " + k + " goodness: " + separation.goodness +
// " threshold: " + separation.threshold);
- if (separation.goodness <= sensitivityThreshold) {
+ if(separation.goodness <= sensitivityThreshold) {
break;
}
ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
- for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
- if (deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
+ for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
+ if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
subset.add(iter);
}
}
// logger.verbose("size:"+subset.size());
- if (subset.size() < minsize) {
+ if(subset.size() < minsize) {
break;
}
current = subset;
@@ -199,7 +199,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
}
}
// No more clusters found
- if (current.size() < minsize || current == unclustered) {
+ if(current.size() < minsize || current == unclustered) {
break;
}
// New cluster found
@@ -210,22 +210,22 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
ret.addToplevelCluster(cluster);
// Remove from main working set.
unclustered.removeDBIDs(current);
- if (progress != null) {
+ if(progress != null) {
progress.setProcessed(relation.size() - unclustered.size(), LOG);
}
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setProcessed(cnum, LOG);
}
}
// Remaining objects are noise
- if (unclustered.size() > 0) {
+ if(unclustered.size() > 0) {
ret.addToplevelCluster(new Cluster<>(unclustered, true));
}
- if (progress != null) {
+ if(progress != null) {
progress.setProcessed(relation.size(), LOG);
progress.ensureCompleted(LOG);
}
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setCompleted(LOG);
}
return ret;
@@ -272,7 +272,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size());
// System.out.println("Number of samples: " + samples);
int remaining_retries = 100;
- for (int i = 1; i <= samples; i++) {
+ for(int i = 1; i <= samples; i++) {
DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong());
final DBIDIter iter = sample.iter();
// Use first as origin
@@ -282,17 +282,17 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
Matrix basis;
{
List<Vector> vectors = new ArrayList<>(sample.size() - 1);
- for (; iter.valid(); iter.advance()) {
+ for(; iter.valid(); iter.advance()) {
Vector vec = relation.get(iter).getColumnVector();
vectors.add(vec.minusEquals(originV));
}
// generate orthogonal basis
basis = generateOrthonormalBasis(vectors);
- if (basis == null) {
+ if(basis == null) {
// new sample has to be taken.
i--;
remaining_retries--;
- if (remaining_retries < 0) {
+ if(remaining_retries < 0) {
            throw new AbortException("Too many retries in sampling, and always a linearly dependent data set.");
}
continue;
@@ -301,9 +301,9 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
// Generate and fill a histogram.
DoubleDynamicHistogram histogram = new DoubleDynamicHistogram(BINS);
double w = 1.0 / currentids.size();
- for (DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) {
// Skip sampled points
- if (sample.contains(iter2)) {
+ if(sample.contains(iter2)) {
continue;
}
Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV);
@@ -311,7 +311,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
histogram.increment(distance, w);
}
double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold
- if (th[1] > separation.goodness) {
+ if(th[1] > separation.goodness) {
separation.goodness = th[1];
separation.threshold = th[0];
separation.originV = originV;
@@ -341,16 +341,16 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
first = first.times(1.0 / first.euclideanLength());
Matrix ret = new Matrix(first.getDimensionality(), vectors.size());
ret.setCol(0, first);
- for (int i = 1; i < vectors.size(); i++) {
+ for(int i = 1; i < vectors.size(); i++) {
// System.out.println("Matrix:" + ret);
Vector v_i = vectors.get(i);
Vector u_i = v_i.copy();
// System.out.println("Vector " + i + ":" + partialSol);
- for (int j = 0; j < i; j++) {
+ for(int j = 0; j < i; j++) {
Vector v_j = ret.getCol(j);
double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j);
- if (Double.isNaN(f)) {
- if (LOG.isDebuggingFine()) {
+ if(Double.isNaN(f)) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Zero vector encountered? " + v_j);
}
return null;
@@ -359,8 +359,8 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
}
      // check whether the vectors were linearly dependent
final double len_u_i = u_i.euclideanLength();
- if (len_u_i == 0.0) {
- if (LOG.isDebuggingFine()) {
+ if(len_u_i == 0.0) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Points not independent - no orthonormalization.");
}
return null;
@@ -391,7 +391,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
{
MeanVariance mv = new MeanVariance();
DoubleHistogram.Iter forward = histogram.iter();
- for (int i = 0; forward.valid(); i++, forward.advance()) {
+ for(int i = 0; forward.valid(); i++, forward.advance()) {
p1[i] = forward.getValue() + ((i > 0) ? p1[i - 1] : 0);
mv.put(i, forward.getValue());
mu1[i] = mv.getMean();
@@ -404,7 +404,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
DoubleHistogram.Iter backwards = histogram.iter();
backwards.seek(histogram.getNumBins() - 1); // Seek to last
- for (int j = n - 1; backwards.valid(); j--, backwards.retract()) {
+ for(int j = n - 1; backwards.valid(); j--, backwards.retract()) {
p2[j] = backwards.getValue() + ((j + 1 < n) ? p2[j + 1] : 0);
mv.put(j, backwards.getValue());
mu2[j] = mv.getMean();
@@ -412,7 +412,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
}
}
- for (int i = 0; i < n; i++) {
+ for(int i = 0; i < n; i++) {
jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i])));
}
@@ -420,23 +420,23 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
double bestgoodness = Double.NEGATIVE_INFINITY;
double devPrev = jt[1] - jt[0];
- for (int i = 1; i < jt.length - 1; i++) {
+ for(int i = 1; i < jt.length - 1; i++) {
double devCur = jt[i + 1] - jt[i];
// System.out.println(p1[i]);
// System.out.println(jt[i + 1]);
// System.out.println(jt[i]);
// System.out.println(devCur);
// Local minimum found - calculate depth
- if (devCur >= 0 && devPrev <= 0) {
+ if(devCur >= 0 && devPrev <= 0) {
double lowestMaxima = Double.POSITIVE_INFINITY;
- for (int j = i - 1; j > 0; j--) {
- if (jt[j - 1] < jt[j]) {
+ for(int j = i - 1; j > 0; j--) {
+ if(jt[j - 1] < jt[j]) {
lowestMaxima = Math.min(lowestMaxima, jt[j]);
break;
}
}
- for (int j = i + 1; j < n - 2; j++) {
- if (jt[j + 1] < jt[j]) {
+ for(int j = i + 1; j < n - 2; j++) {
+ if(jt[j + 1] < jt[j]) {
lowestMaxima = Math.min(lowestMaxima, jt[j]);
break;
}
@@ -445,11 +445,11 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
final double mud = mu1[i] - mu2[i];
double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]);
- if (Double.isNaN(discriminability)) {
+ if(Double.isNaN(discriminability)) {
discriminability = -1;
}
double goodness = localDepth * discriminability;
- if (goodness > bestgoodness) {
+ if(goodness > bestgoodness) {
bestgoodness = goodness;
bestpos = i;
}
@@ -552,7 +552,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
* Threshold
*/
private double threshold;
-
+
/**
* Random generator
*/
@@ -562,26 +562,26 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter maxLMDimP = new IntParameter(MAXDIM_ID);
- maxLMDimP.addConstraint(new GreaterEqualConstraint(1));
+ maxLMDimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
maxLMDimP.setOptional(true);
- if (config.grab(maxLMDimP)) {
+ if(config.grab(maxLMDimP)) {
maxdim = maxLMDimP.getValue();
}
IntParameter minsizeP = new IntParameter(MINSIZE_ID);
- minsizeP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(minsizeP)) {
+ minsizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minsizeP)) {
minsize = minsizeP.getValue();
}
IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100);
- if (config.grab(samplingLevelP)) {
+ if(config.grab(samplingLevelP)) {
samplingLevel = samplingLevelP.getValue();
}
DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID);
- if (config.grab(sensivityThresholdP)) {
+ if(config.grab(sensivityThresholdP)) {
threshold = sensivityThresholdP.getValue();
}
RandomParameter rndP = new RandomParameter(RANDOM_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
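
For reference, generateOrthonormalBasis above is classical Gram-Schmidt with an explicit linear-dependence check: each new vector has its projection onto every already-accepted basis vector subtracted, and a (near-)zero remainder aborts the sample. A self-contained sketch on plain double arrays rather than the ELKI Matrix/Vector API:

    // Orthogonalize v against the first m accepted basis vectors.
    // Returns null if v is (numerically) linearly dependent on them.
    static double[] orthogonalize(double[] v, double[][] basis, int m) {
      double[] u = v.clone();
      for(int j = 0; j < m; j++) {
        double f = dot(v, basis[j]) / dot(basis[j], basis[j]);
        if(Double.isNaN(f)) {
          return null; // zero basis vector encountered
        }
        for(int d = 0; d < u.length; d++) {
          u[d] -= f * basis[j][d]; // subtract projection onto basis[j]
        }
      }
      return norm(u) > 0. ? u : null; // dependent if nothing remains
    }

    static double dot(double[] a, double[] b) {
      double s = 0.;
      for(int i = 0; i < a.length; i++) {
        s += a[i] * b[i];
      }
      return s;
    }

    static double norm(double[] a) {
      return Math.sqrt(dot(a, a));
    }
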
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
index a9c67a58..7733ddaa 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
@@ -61,8 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -135,7 +134,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// current dimensionality associated with each seed
int dim_c = RelationUtil.dimensionality(relation);
- if (dim_c < l) {
+ if(dim_c < l) {
throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + dim_c + " < " + l + ")");
}
@@ -149,8 +148,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null;
- while (k_c > k) {
- if (cprogress != null) {
+ while(k_c > k) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size(), LOG);
}
@@ -158,8 +157,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
assign(relation, distFunc, clusters);
// determine current subspace associated with each cluster
- for (ORCLUSCluster cluster : clusters) {
- if (cluster.objectIDs.size() > 0) {
+ for(ORCLUSCluster cluster : clusters) {
+ if(cluster.objectIDs.size() > 0) {
cluster.basis = findBasis(relation, distFunc, cluster, dim_c);
}
}
@@ -172,18 +171,19 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
}
assign(relation, distFunc, clusters);
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size());
cprogress.setCompleted(LOG);
}
// get the result
Clustering<Model> r = new Clustering<>("ORCLUS clustering", "orclus-clustering");
- for (ORCLUSCluster c : clusters) {
+ for(ORCLUSCluster c : clusters) {
r.addToplevelCluster(new Cluster<Model>(c.objectIDs, ClusterModel.CLUSTER));
}
return r;
- } catch (Exception e) {
+ }
+ catch(Exception e) {
throw new IllegalStateException(e);
}
}
@@ -199,7 +199,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, rnd);
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database);
List<ORCLUSCluster> seeds = new ArrayList<>();
- for (DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) {
seeds.add(new ORCLUSCluster(database.get(iter), iter, factory));
}
return seeds;
@@ -217,29 +217,29 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
private void assign(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters) {
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database);
// clear the current clusters
- for (ORCLUSCluster cluster : clusters) {
+ for(ORCLUSCluster cluster : clusters) {
cluster.objectIDs.clear();
}
// projected centroids of the clusters
List<V> projectedCentroids = new ArrayList<>(clusters.size());
- for (ORCLUSCluster c : clusters) {
+ for(ORCLUSCluster c : clusters) {
projectedCentroids.add(projection(c, c.centroid, factory));
}
// for each data point o do
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
V o = database.get(it);
DoubleDistance minDist = null;
ORCLUSCluster minCluster = null;
// determine projected distance between o and cluster
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < clusters.size(); i++) {
ORCLUSCluster c = clusters.get(i);
V o_proj = projection(c, o, factory);
DoubleDistance dist = distFunc.distance(o_proj, projectedCentroids.get(i));
- if (minDist == null || minDist.compareTo(dist) > 0) {
+ if(minDist == null || minDist.compareTo(dist) > 0) {
minDist = dist;
minCluster = c;
}
@@ -250,8 +250,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
}
    // recompute the seed of each cluster
- for (ORCLUSCluster cluster : clusters) {
- if (cluster.objectIDs.size() > 0) {
+ for(ORCLUSCluster cluster : clusters) {
+ if(cluster.objectIDs.size() > 0) {
cluster.centroid = Centroid.make(database, cluster.objectIDs).toVector(database);
}
}
@@ -271,7 +271,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// covariance matrix of cluster
// Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs);
GenericDistanceDBIDList<DoubleDistance> results = new GenericDistanceDBIDList<>(cluster.objectIDs.size());
- for (DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) {
+ for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) {
DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it));
results.add(distance, it);
}
@@ -304,9 +304,9 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
*/
private void merge(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters, int k_new, int d_new, IndefiniteProgress cprogress) {
ArrayList<ProjectedEnergy> projectedEnergies = new ArrayList<>();
- for (int i = 0; i < clusters.size(); i++) {
- for (int j = 0; j < clusters.size(); j++) {
- if (i >= j) {
+ for(int i = 0; i < clusters.size(); i++) {
+ for(int j = 0; j < clusters.size(); j++) {
+ if(i >= j) {
continue;
}
// projected energy of c_ij in subspace e_ij
@@ -318,8 +318,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
}
}
- while (clusters.size() > k_new) {
- if (cprogress != null) {
+ while(clusters.size() > k_new) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size(), LOG);
}
// find the smallest value of r_ij
@@ -327,12 +327,12 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// renumber the clusters by replacing cluster c_i with cluster c_ij
// and discarding cluster c_j
- for (int c = 0; c < clusters.size(); c++) {
- if (c == minPE.i) {
+ for(int c = 0; c < clusters.size(); c++) {
+ if(c == minPE.i) {
clusters.remove(c);
clusters.add(c, minPE.cluster);
}
- if (c == minPE.j) {
+ if(c == minPE.j) {
clusters.remove(c);
}
}
@@ -341,15 +341,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
int i = minPE.i;
int j = minPE.j;
Iterator<ProjectedEnergy> it = projectedEnergies.iterator();
- while (it.hasNext()) {
+ while(it.hasNext()) {
ProjectedEnergy pe = it.next();
- if (pe.i == i || pe.i == j || pe.j == i || pe.j == j) {
+ if(pe.i == i || pe.i == j || pe.j == i || pe.j == j) {
it.remove();
- } else {
- if (pe.i > j) {
+ }
+ else {
+ if(pe.i > j) {
pe.i -= 1;
}
- if (pe.j > j) {
+ if(pe.j > j) {
pe.j -= 1;
}
}
@@ -357,10 +358,11 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// ... and recompute them
ORCLUSCluster c_ij = minPE.cluster;
- for (int c = 0; c < clusters.size(); c++) {
- if (c < i) {
+ for(int c = 0; c < clusters.size(); c++) {
+ if(c < i) {
projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, c, i, d_new));
- } else if (c > i) {
+ }
+ else if(c > i) {
projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, i, c, d_new));
}
}
@@ -389,7 +391,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
double sum = 0.;
V c_proj = projection(c_ij, c_ij.centroid, factory);
- for (DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) {
V o_proj = projection(c_ij, database.get(iter), factory);
double dist = distFunc.distance(o_proj, c_proj).doubleValue();
sum += dist * dist;
@@ -417,15 +419,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// convert into array.
c.objectIDs = DBIDUtil.newArray(c.objectIDs);
- if (c.objectIDs.size() > 0) {
+ if(c.objectIDs.size() > 0) {
c.centroid = Centroid.make(relation, c.objectIDs).toVector(relation);
c.basis = findBasis(relation, distFunc, c, dim);
- } else {
+ }
+ else {
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5);
c.centroid = factory.newNumberVector(cent.getArrayRef());
double[][] doubles = new double[c1.basis.getRowDimensionality()][dim];
- for (int i = 0; i < dim; i++) {
+ for(int i = 0; i < dim; i++) {
doubles[i][i] = 1;
}
c.basis = new Matrix(doubles);
@@ -590,16 +593,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
protected void configAlpha(Parameterization config) {
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5);
- alphaP.addConstraint(new GreaterConstraint(0));
- alphaP.addConstraint(new LessEqualConstraint(1));
- if (config.grab(alphaP)) {
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
}
protected void configSeed(Parameterization config) {
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
index 545a8171..1b316c7c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
@@ -23,7 +23,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
@@ -67,12 +68,12 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh
/**
* Range to query with
*/
- D epsilon;
+ protected D epsilon;
/**
* Distance function to use
*/
- DistanceFunction<O, D> distFunc;
+ protected DistanceFunction<O, D> distFunc;
/**
* Full constructor.
@@ -177,14 +178,14 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
// Get a distance function.
- ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
+ ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
D distanceFactory = null;
if(config.grab(distanceP)) {
distfun = distanceP.instantiateClass(config);
distanceFactory = distfun.getDistanceFactory();
}
// Get the epsilon parameter
- DistanceParameter<D> epsilonP = new DistanceParameter<>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory);
+ DistanceParameter<D> epsilonP = new DistanceParameter<>(DBSCAN.Parameterizer.EPSILON_ID, distanceFactory);
if(config.grab(epsilonP)) {
epsilon = epsilonP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
index a6e62e2e..ac7ba81d 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
@@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
@@ -54,7 +55,7 @@ public class MinPtsCorePredicate implements CorePredicate {
/**
* The minpts parameter.
*/
- int minpts;
+ protected int minpts;
/**
* Default constructor.
@@ -127,7 +128,7 @@ public class MinPtsCorePredicate implements CorePredicate {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
// Get the minpts parameter
- IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID);
+ IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
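
Both predicate classes now reference DBSCAN's option IDs through DBSCAN.Parameterizer instead of the fully qualified class name, and open their fields to subclasses. They are the two plug-in points of generalized DBSCAN: a neighbor predicate supplies each point's neighborhood, and a core predicate decides whether that neighborhood makes the point a cluster core. The decomposition, sketched independently of the ELKI type hierarchy:

    interface NeighborPredicate<O> {
      java.util.List<O> getNeighbors(O obj);
    }

    interface CorePredicate<O> {
      boolean isCore(O obj, java.util.List<O> neighbors);
    }

    // Classic DBSCAN core test: |N_eps(o)| >= minPts.
    final class MinPtsCore<O> implements CorePredicate<O> {
      private final int minpts;

      MinPtsCore(int minpts) {
        this.minpts = minpts;
      }

      @Override
      public boolean isCore(O obj, java.util.List<O> neighbors) {
        return neighbors.size() >= minpts;
      }
    }
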
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java
index ac5cb77c..f6dbc88f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
@@ -178,9 +178,10 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DataStore<D> lambda = pointerresult.getParentDistanceStore();
Clustering<DendrogramModel<D>> result;
- if (lambda instanceof DoubleDistanceDataStore) {
+ if(lambda instanceof DoubleDistanceDataStore) {
result = extractClustersDouble(ids, pi, (DoubleDistanceDataStore) lambda);
- } else {
+ }
+ else {
result = extractClusters(ids, pi, lambda);
}
result.addChildResult(pointerresult);
@@ -208,28 +209,31 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDArrayIter it = order.iter(); // Used multiple times!
int split;
- if (minclusters > 0) {
+ if(minclusters > 0) {
split = Math.max(ids.size() - minclusters, 0);
// Stop distance:
final D stopdist = lambda.get(order.get(split));
// Tie handling: decrement split.
- while (split > 0) {
+ while(split > 0) {
it.seek(split - 1);
- if (stopdist.compareTo(lambda.get(it)) <= 0) {
+ if(stopdist.compareTo(lambda.get(it)) <= 0) {
split--;
- } else {
+ }
+ else {
break;
}
}
- } else if (threshold != null) {
+ }
+ else if(threshold != null) {
split = ids.size();
it.seek(split - 1);
- while (threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) {
+ while(threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) {
split--;
it.retract();
}
- } else { // full hierarchy
+ }
+ else { // full hierarchy
split = 0;
}
@@ -242,19 +246,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDVar succ = DBIDUtil.newVar(); // Variable for successor.
// Go backwards on the lower part.
- for (it.seek(split - 1); it.valid(); it.retract()) {
+ for(it.seek(split - 1); it.valid(); it.retract()) {
D dist = lambda.get(it); // Distance to successor
pi.assignVar(it, succ); // succ = pi(it)
int clusterid = cluster_map.intValue(succ);
// Successor cluster has already been created:
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
cluster_dbids.get(clusterid).add(it);
cluster_map.putInt(it, clusterid);
// Update distance to maximum encountered:
- if (cluster_dist.get(clusterid).compareTo(dist) < 0) {
+ if(cluster_dist.get(clusterid).compareTo(dist) < 0) {
cluster_dist.set(clusterid, dist);
}
- } else {
+ }
+ else {
// Need to start a new cluster:
clusterid = cluster_dbids.size(); // next cluster number.
ModifiableDBIDs cids = DBIDUtil.newArray();
@@ -270,12 +275,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
      // Increment the progress counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
final Clustering<DendrogramModel<D>> dendrogram;
- switch(outputmode) {
+ switch(outputmode){
case PARTIAL_HIERARCHY: {
// Build a hierarchy out of these clusters.
dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering");
@@ -284,74 +289,81 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i)));
}
cluster_dist = null; // Invalidate
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
// The current cluster led by the current element:
final Cluster<DendrogramModel<D>> clus;
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
clus = clusters.get(clusterid);
- } else if (!singletons && ids.size() != 1) {
+ }
+ else if(!singletons && ids.size() != 1) {
clus = null;
- } else {
+ }
+ else {
clus = makeCluster(it, null, DBIDUtil.deref(it));
}
// The successor to join:
pi.assignVar(it, succ); // succ = pi(it)
- if (DBIDUtil.equal(it, succ)) {
+ if(DBIDUtil.equal(it, succ)) {
assert (root == null);
root = clus;
- } else {
+ }
+ else {
// Parent cluster:
int parentid = cluster_map.intValue(succ);
D depth = lambda.get(it);
// Parent cluster exists - merge as a new cluster:
- if (parentid >= 0) {
+ if(parentid >= 0) {
final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid);
- if (pclus.getModel().getDistance().equals(depth)) {
- if (clus == null) {
+ if(pclus.getModel().getDistance().equals(depth)) {
+ if(clus == null) {
((ModifiableDBIDs) pclus.getIDs()).add(it);
- } else {
+ }
+ else {
dendrogram.addChildCluster(pclus, clus);
}
- } else {
+ }
+ else {
// Merge at new depth:
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids);
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(npclus, clus);
}
dendrogram.addChildCluster(npclus, pclus);
// Replace existing parent cluster: new depth
clusters.set(parentid, npclus);
}
- } else {
+ }
+ else {
// Merge with parent at this depth:
final Cluster<DendrogramModel<D>> pclus;
- if (!singletons) {
+ if(!singletons) {
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1);
cids.add(succ);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
// New cluster for parent and/or new point
pclus = makeCluster(succ, depth, cids);
- } else {
+ }
+ else {
// Create a new, one-element cluster for parent, and a merged
// cluster on top.
pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS);
dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ)));
}
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(pclus, clus);
}
// Store cluster:
@@ -362,7 +374,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
      // Increment the progress counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -377,21 +389,21 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
dendrogram.addToplevelCluster(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i)));
}
cluster_dist = null; // Invalidate
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
- if (clusterid < 0) {
+ if(clusterid < 0) {
dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it)));
}
      // Increment the progress counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -401,7 +413,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
throw new AbortException("Unsupported output mode.");
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
@@ -428,29 +440,32 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDArrayIter it = order.iter(); // Used multiple times!
int split;
- if (minclusters > 0) {
+ if(minclusters > 0) {
split = Math.max(ids.size() - minclusters, 0);
// Stop distance:
final double stopdist = lambda.doubleValue(order.get(split));
// Tie handling: decrement split.
- while (split > 0) {
+ while(split > 0) {
it.seek(split - 1);
- if (stopdist <= lambda.doubleValue(it)) {
+ if(stopdist <= lambda.doubleValue(it)) {
split--;
- } else {
+ }
+ else {
break;
}
}
- } else if (threshold != null) {
+ }
+ else if(threshold != null) {
split = ids.size();
it.seek(split - 1);
double stopdist = ((DoubleDistance) threshold).doubleValue();
- while (stopdist <= lambda.doubleValue(it) && it.valid()) {
+ while(stopdist <= lambda.doubleValue(it) && it.valid()) {
split--;
it.retract();
}
- } else { // full hierarchy
+ }
+ else { // full hierarchy
split = 0;
}
@@ -463,19 +478,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDVar succ = DBIDUtil.newVar(); // Variable for successor.
// Go backwards on the lower part.
- for (it.seek(split - 1); it.valid(); it.retract()) {
+ for(it.seek(split - 1); it.valid(); it.retract()) {
double dist = lambda.doubleValue(it); // Distance to successor
pi.assignVar(it, succ); // succ = pi(it)
int clusterid = cluster_map.intValue(succ);
// Successor cluster has already been created:
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
cluster_dbids.get(clusterid).add(it);
cluster_map.putInt(it, clusterid);
// Update distance to maximum encountered:
- if (cluster_dist.get(clusterid) < dist) {
+ if(cluster_dist.get(clusterid) < dist) {
cluster_dist.set(clusterid, dist);
}
- } else {
+ }
+ else {
// Need to start a new cluster:
clusterid = cluster_dbids.size(); // next cluster number.
ModifiableDBIDs cids = DBIDUtil.newArray();
@@ -491,12 +507,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
      // Increment the progress counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
final Clustering<DendrogramModel<D>> dendrogram;
- switch(outputmode) {
+ switch(outputmode){
case PARTIAL_HIERARCHY: {
// Build a hierarchy out of these clusters.
dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering");
@@ -505,7 +521,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
@SuppressWarnings("unchecked")
D depth = (D) new DoubleDistance(cluster_dist.get(i));
clusters.add(makeCluster(it2, depth, cluster_dbids.get(i)));
@@ -514,68 +530,75 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
// The current cluster led by the current element:
final Cluster<DendrogramModel<D>> clus;
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
clus = clusters.get(clusterid);
- } else if (!singletons && ids.size() != 1) {
+ }
+ else if(!singletons && ids.size() != 1) {
clus = null;
- } else {
+ }
+ else {
clus = makeCluster(it, null, DBIDUtil.deref(it));
}
// The successor to join:
pi.assignVar(it, succ); // succ = pi(it)
- if (DBIDUtil.equal(it, succ)) {
+ if(DBIDUtil.equal(it, succ)) {
assert (root == null);
root = clus;
- } else {
+ }
+ else {
// Parent cluster:
int parentid = cluster_map.intValue(succ);
@SuppressWarnings("unchecked")
D depth = (D) new DoubleDistance(lambda.doubleValue(it));
// Parent cluster exists - merge as a new cluster:
- if (parentid >= 0) {
+ if(parentid >= 0) {
final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid);
- if (pclus.getModel().getDistance().equals(depth)) {
- if (clus == null) {
+ if(pclus.getModel().getDistance().equals(depth)) {
+ if(clus == null) {
((ModifiableDBIDs) pclus.getIDs()).add(it);
- } else {
+ }
+ else {
dendrogram.addChildCluster(pclus, clus);
}
- } else {
+ }
+ else {
// Merge at new depth:
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids);
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(npclus, clus);
}
dendrogram.addChildCluster(npclus, pclus);
// Replace existing parent cluster: new depth
clusters.set(parentid, npclus);
}
- } else {
+ }
+ else {
// Merge with parent at this depth:
final Cluster<DendrogramModel<D>> pclus;
- if (!singletons) {
+ if(!singletons) {
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1);
cids.add(succ);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
// New cluster for parent and/or new point
pclus = makeCluster(succ, depth, cids);
- } else {
+ }
+ else {
// Create a new, one-element cluster for parent, and a merged
// cluster on top.
pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS);
dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ)));
}
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(pclus, clus);
}
// Store cluster:
@@ -586,7 +609,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
      // Increment the progress counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -601,7 +624,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
@SuppressWarnings("unchecked")
D depth = (D) new DoubleDistance(cluster_dist.get(i));
dendrogram.addToplevelCluster(makeCluster(it2, depth, cluster_dbids.get(i)));
@@ -610,14 +633,14 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
- if (clusterid < 0) {
+ if(clusterid < 0) {
dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it)));
}
      // Increment the progress counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -627,7 +650,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
throw new AbortException("Unsupported output mode.");
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
@@ -644,13 +667,16 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
*/
private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members) {
final String name;
- if (members.size() == 0) {
+ if(members.size() == 0) {
name = "mrg_" + DBIDUtil.toString(lead) + "_" + depth;
- } else if (depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) {
+ }
+ else if(depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) {
name = "obj_" + DBIDUtil.toString(lead);
- } else if (depth != null) {
+ }
+ else if(depth != null) {
name = "clu_" + DBIDUtil.toString(lead) + "_" + depth;
- } else {
+ }
+ else {
// Complete data set only?
name = "clu_" + DBIDUtil.toString(lead);
}
@@ -794,53 +820,54 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<HierarchicalClusteringAlgorithm<D>> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class);
- if (config.grab(algorithmP)) {
+ if(config.grab(algorithmP)) {
algorithm = algorithmP.instantiateClass(config);
}
EnumParameter<ThresholdMode> modeP = new EnumParameter<>(MODE_ID, ThresholdMode.class, ThresholdMode.BY_MINCLUSTERS);
- if (config.grab(modeP)) {
+ if(config.grab(modeP)) {
thresholdmode = modeP.getValue();
}
- if (thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) {
+ if(thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) {
IntParameter minclustersP = new IntParameter(MINCLUSTERS_ID);
- minclustersP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(minclustersP)) {
+ minclustersP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minclustersP)) {
minclusters = minclustersP.intValue();
}
}
- if (thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) {
+ if(thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) {
// Fallback to double when no algorithm chosen yet:
@SuppressWarnings("unchecked")
final D factory = algorithm != null ? algorithm.getDistanceFactory() : (D) DoubleDistance.FACTORY;
DistanceParameter<D> distP = new DistanceParameter<>(THRESHOLD_ID, factory);
- if (config.grab(distP)) {
+ if(config.grab(distP)) {
threshold = distP.getValue();
}
}
- if (thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) {
+ if(thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) {
EnumParameter<OutputMode> outputP = new EnumParameter<>(OUTPUTMODE_ID, OutputMode.class);
- if (config.grab(outputP)) {
+ if(config.grab(outputP)) {
outputmode = outputP.getValue();
}
- } else {
+ }
+ else {
// This becomes full hierarchy:
minclusters = -1;
outputmode = OutputMode.PARTIAL_HIERARCHY;
}
Flag singletonsF = new Flag(SINGLETONS_ID);
- if (config.grab(singletonsF)) {
+ if(config.grab(singletonsF)) {
singletons = singletonsF.isTrue();
}
}
@Override
protected ExtractFlatClusteringFromHierarchy<D> makeInstance() {
- switch(thresholdmode) {
+ switch(thresholdmode){
case NO_THRESHOLD:
case BY_MINCLUSTERS:
return new ExtractFlatClusteringFromHierarchy<>(algorithm, minclusters, outputmode, singletons);
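
ExtractFlatClusteringFromHierarchy consumes the pointer representation produced by SLINK-style algorithms: pi maps each object to the object whose cluster it merges into, lambda gives the distance at which that merge happens, and the code above cuts the hierarchy either at a distance threshold or so that at least minclusters clusters remain. The core of a threshold cut, sketched on plain arrays, assuming the monotone pointer representation of SLINK (lambda[i] <= lambda[pi[i]], with lambda = Double.POSITIVE_INFINITY at the root):

    // Assign flat cluster ids by following parent pointers until the
    // next merge would happen at or above the cut threshold.
    static int[] cutAt(int[] pi, double[] lambda, double threshold) {
      final int n = pi.length;
      int[] cluster = new int[n];
      java.util.Arrays.fill(cluster, -1);
      int next = 0;
      for(int i = 0; i < n; i++) {
        int p = i;
        while(lambda[p] < threshold) {
          p = pi[p]; // merge below the cut: same flat cluster
        }
        if(cluster[p] < 0) {
          cluster[p] = next++; // p becomes the cluster representative
        }
        cluster[i] = cluster[p];
      }
      return cluster;
    }
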
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index dc1fa47c..5754e961 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
@@ -49,8 +50,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -105,68 +105,61 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param relation the database to cluster
* @param means a list of k means
   * @param clusters Current clusters
+ * @param assignment Current cluster assignment
* @return true when the object was reassigned
*/
- protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) {
+ protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
boolean changed = false;
- if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
- if (dist < mindist) {
+ if(dist < mindist) {
minIndex = i;
mindist = dist;
}
}
- if (clusters.get(minIndex).add(iditer)) {
- changed = true;
- // Remove from previous cluster
- // TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
- break;
- }
- }
- }
- }
+ changed |= updateAssignment(iditer, clusters, assignment, minIndex);
}
- } else {
+ }
+ else {
final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
- if (dist.compareTo(mindist) < 0) {
+ if(dist.compareTo(mindist) < 0) {
minIndex = i;
mindist = dist;
}
}
- if (clusters.get(minIndex).add(iditer)) {
- changed = true;
- // Remove from previous cluster
- // TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
- break;
- }
- }
- }
- }
+ changed |= updateAssignment(iditer, clusters, assignment, minIndex);
}
}
return changed;
}
+ protected boolean updateAssignment(DBIDIter iditer, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, int newA) {
+ final int oldA = assignment.intValue(iditer);
+ if(oldA == newA) {
+ return false;
+ }
+ clusters.get(newA).add(iditer);
+ assignment.putInt(iditer, newA);
+ if(oldA >= 0) {
+ clusters.get(oldA).remove(iditer);
+ }
+ return true;
+ }
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(new CombinedTypeInformation(TypeUtil.NUMBER_VECTOR_FIELD, getDistanceFunction().getInputTypeRestriction()));
@@ -181,24 +174,28 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @return the mean vectors of the given clusters in the given database
*/
protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) {
+ // TODO: use Kahan summation for better numerical precision?
List<Vector> newMeans = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
ModifiableDBIDs list = clusters.get(i);
Vector mean = null;
- if (list.size() > 0) {
- double s = 1.0 / list.size();
+ if(list.size() > 0) {
DBIDIter iter = list.iter();
- assert (iter.valid());
- mean = database.get(iter).getColumnVector().timesEquals(s);
+ // Initialize with first.
+ mean = database.get(iter).getColumnVector();
double[] raw = mean.getArrayRef();
iter.advance();
- for (; iter.valid(); iter.advance()) {
+ // Update with remaining instances
+ for(; iter.valid(); iter.advance()) {
NumberVector<?> vec = database.get(iter);
- for (int j = 0; j < mean.getDimensionality(); j++) {
- raw[j] += s * vec.doubleValue(j);
+ for(int j = 0; j < mean.getDimensionality(); j++) {
+ raw[j] += vec.doubleValue(j);
}
}
- } else {
+ mean.timesEquals(1.0 / list.size());
+ }
+ else {
+      // Keep degenerate means as-is for now.
mean = means.get(i).getColumnVector();
}
newMeans.add(mean);
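
The new TODO refers to compensated summation: accumulating many coordinates in plain doubles loses low-order bits, and Kahan's algorithm carries that error along in a correction term. A per-dimension sketch of what such a mean computation could look like:

    // Kahan-compensated mean: c carries the low-order error of the sum.
    static double kahanMean(double[] values) {
      double sum = 0., c = 0.;
      for(double v : values) {
        double y = v - c;   // apply the carried correction
        double t = sum + y; // big + small: low bits of y are lost...
        c = (t - sum) - y;  // ...and recovered here
        sum = t;
      }
      return sum / values.length;
    }
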
@@ -218,17 +215,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
final int dim = medians.get(0).getDimensionality();
final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database);
List<NumberVector<?>> newMedians = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i));
- if (list.size() > 0) {
+ if(list.size() > 0) {
Vector mean = new Vector(dim);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
sorter.setDimension(d);
DBID id = QuickSelect.median(list, sorter);
mean.set(d, database.get(id).doubleValue(d));
}
newMedians.add(mean);
- } else {
+ }
+ else {
newMedians.add((NumberVector<?>) medians.get(i));
}
}
@@ -244,14 +242,11 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param op Cluster size change / Weight change
*/
protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) {
- if (newsize == 0) {
+ if(newsize == 0) {
return; // Keep old mean
}
- Vector delta = vec.getColumnVector();
- // Compute difference from mean
- delta.minusEquals(mean);
- delta.timesEquals(op / newsize);
- mean.plusEquals(delta);
+ Vector delta = vec.getColumnVector().minusEquals(mean);
+ mean.plusTimesEquals(delta, op / newsize);
}
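
The simplified body fuses the standard online mean update into one step: with delta = x - mean, adding x to a cluster whose size becomes n shifts the mean by delta / n, since (n-1)*mean + x = n*(mean + delta/n); removal applies the same shift with negative weight. A worked example per dimension:

    // Adding x = (5, 1) to a cluster of 3 points with mean (1, 1):
    double[] mean = { 1., 1. };
    double[] x = { 5., 1. };
    int n = 4; // size after the addition
    for(int d = 0; d < mean.length; d++) {
      mean[d] += (x[d] - mean[d]) / n; // -> mean becomes (2, 1)
    }
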
/**
@@ -260,76 +255,84 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param relation Relation
* @param means Means
* @param clusters Clusters
+ * @param assignment Current cluster assignment
* @return true when the means have changed
*/
- protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
+ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
boolean changed = false;
- if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
// Raw distance function
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
// Incremental update
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
- if (dist < mindist) {
+ if(dist < mindist) {
minIndex = i;
mindist = dist;
}
}
- // Update the cluster mean incrementally:
- for (int i = 0; i < k; i++) {
- ModifiableDBIDs ci = clusters.get(i);
- if (i == minIndex) {
- if (ci.add(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
- changed = true;
- }
- } else if (ci.remove(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
- changed = true;
- }
- }
+ changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
}
- } else {
+ }
+ else {
// Raw distance function
final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
// Incremental update
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
- if (dist.compareTo(mindist) < 0) {
+ if(dist.compareTo(mindist) < 0) {
minIndex = i;
mindist = dist;
}
}
- // Update the cluster mean incrementally:
- for (int i = 0; i < k; i++) {
- ModifiableDBIDs ci = clusters.get(i);
- if (i == minIndex) {
- if (ci.add(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
- changed = true;
- }
- } else if (ci.remove(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
- changed = true;
- }
- }
+ changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
}
}
return changed;
}
+ /**
+ * Try to update the cluster assignment.
+ *
+ * @param clusters Current clusters
+ * @param means Means to update
+ * @param minIndex Cluster to assign to
+ * @param fv Vector
+ * @param iditer Object ID
+ * @param assignment Current cluster assignment
+ * @return {@code true} when assignment changed
+ */
+ private boolean updateMeanAndAssignment(List<ModifiableDBIDs> clusters, List<Vector> means, int minIndex, V fv, DBIDIter iditer, WritableIntegerDataStore assignment) {
+ int cur = assignment.intValue(iditer);
+ if(cur == minIndex) {
+ return false;
+ }
+ final ModifiableDBIDs curclus = clusters.get(minIndex);
+ curclus.add(iditer);
+ incrementalUpdateMean(means.get(minIndex), fv, curclus.size(), +1);
+
+ if(cur >= 0) {
+ ModifiableDBIDs ci = clusters.get(cur);
+ ci.remove(iditer);
+ incrementalUpdateMean(means.get(cur), fv, ci.size(), -1); // divide by the size after removal
+ }
+
+ assignment.putInt(iditer, minIndex);
+ return true;
+ }
+
@Override
public void setK(int k) {
this.k = k;
@@ -366,27 +369,27 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
@Override
protected void makeOptions(Parameterization config) {
ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
- if (config.grab(distanceFunctionP)) {
+ if(config.grab(distanceFunctionP)) {
distanceFunction = distanceFunctionP.instantiateClass(config);
- if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
+ if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
getLogger().warning("k-means optimizes the sum of squares - it should be used with squared Euclidean distance and may stop converging otherwise!");
}
}
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.getValue();
}
}
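
Note on the incremental update used throughout this hunk: adding a point x to a cluster whose size becomes n shifts the mean by (x - mean)/n, and removing a point shifts it by -(x - mean)/n, with n the size after the change. A minimal standalone sketch of this rule, using plain double arrays instead of ELKI's Vector class (names mirror the patch but are illustrative):

  // op is +1 when adding x and -1 when removing it; newsize is the
  // cluster size after the change. An empty cluster keeps its old mean.
  static void incrementalUpdateMean(double[] mean, double[] x, int newsize, double op) {
    if (newsize == 0) {
      return;
    }
    for (int d = 0; d < mean.length; d++) {
      mean[d] += op * (x[d] - mean[d]) / newsize;
    }
  }
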
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
index 30bb640c..51e7ace9 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
@@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -90,34 +90,35 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
- if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) {
+ if(!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) {
throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass());
}
final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction();
Clustering<M> bestResult = null;
- if (trials > 1) {
+ if(trials > 1) {
double bestCost = Double.POSITIVE_INFINITY;
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
- for (int i = 0; i < trials; i++) {
+ for(int i = 0; i < trials; i++) {
Clustering<M> currentCandidate = innerkMeans.run(database, relation);
double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Cost of candidate " + i + ": " + currentCost);
}
- if (currentCost < bestCost) {
+ if(currentCost < bestCost) {
bestResult = currentCandidate;
bestCost = currentCost;
}
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
- } else {
+ }
+ else {
bestResult = innerkMeans.run(database);
}
@@ -195,18 +196,18 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
@Override
protected void makeOptions(Parameterization config) {
IntParameter trialsP = new IntParameter(TRIALS_ID);
- trialsP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(trialsP)) {
+ trialsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(trialsP)) {
trials = trialsP.intValue();
}
ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
- if (config.grab(kMeansVariantP)) {
+ if(config.grab(kMeansVariantP)) {
kMeansVariant = kMeansVariantP.instantiateClass(config);
}
ObjectParameter<KMeansQualityMeasure<V, ? super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class);
- if (config.grab(qualityMeasureP)) {
+ if(config.grab(qualityMeasureP)) {
qualityMeasure = qualityMeasureP.instantiateClass(config);
}
}
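
The restart loop above simply keeps the candidate clustering with the lowest cost under the configured quality measure. As an illustration of what such a cost typically is, here is a hedged sketch of within-cluster sum of squares on plain arrays (illustrative names, not ELKI's KMeansQualityMeasure API); lower is better, which is why the loop keeps the minimum:

  // clusters[i] holds the points assigned to cluster i; means[i] is its mean.
  static double withinClusterSumOfSquares(double[][][] clusters, double[][] means) {
    double cost = 0.;
    for (int i = 0; i < clusters.length; i++) {
      for (double[] point : clusters[i]) {
        for (int d = 0; d < point.length; d++) {
          double diff = point[d] - means[i][d];
          cost += diff * diff; // squared Euclidean distance to the cluster mean
        }
      }
    }
    return cost;
  }
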
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
index a018c04b..9edfd816 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
*/
import java.util.ArrayList;
import java.util.List;
-import java.util.Random;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.Database;
@@ -74,7 +73,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
@Override
public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
// Get a distance query
- if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
@@ -84,26 +83,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Choose first mean
List<V> means = new ArrayList<>(k);
- Random random = rnd.getRandom();
- DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter();
+ DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter();
means.add(relation.get(first));
DBIDVar best = DBIDUtil.newVar(first);
- for (int i = (dropfirst ? 0 : 1); i < k; i++) {
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
// Find farthest object:
double maxdist = Double.NEGATIVE_INFINITY;
- for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
double dsum = 0.;
- for (V ex : means) {
+ for(V ex : means) {
dsum += distQ.distance(ex, it).doubleValue();
}
- if (dsum > maxdist) {
+ if(dsum > maxdist) {
maxdist = dsum;
best.set(it);
}
}
// Add new mean:
- if (k == 0) {
+ if(i == 0) {
means.clear(); // Remove temporary first element.
}
means.add(relation.get(best));
@@ -114,7 +112,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
@Override
public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
- if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
@@ -123,26 +121,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Choose first mean
ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
- Random random = rnd.getRandom();
- DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter();
+ DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter();
means.add(first);
DBIDVar best = DBIDUtil.newVar(first);
- for (int i = (dropfirst ? 0 : 1); i < k; i++) {
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
// Find farthest object:
double maxdist = Double.NEGATIVE_INFINITY;
- for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
double dsum = 0.;
- for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) {
+ for(DBIDIter ex = means.iter(); ex.valid(); ex.advance()) {
dsum += distQ.distance(ex, it).doubleValue();
}
- if (dsum > maxdist) {
+ if(dsum > maxdist) {
maxdist = dsum;
best.set(it);
}
}
// Add new mean:
- if (k == 0) {
+ if(i == 0) {
means.clear(); // Remove temporary first element.
}
means.add(best);
@@ -173,7 +170,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
Flag dropfirstP = new Flag(DROPFIRST_ID);
- if (config.grab(dropfirstP)) {
+ if(config.grab(dropfirstP)) {
dropfirst = dropfirstP.isTrue();
}
}
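
Both selection loops above pick, in each round, the object with the largest summed distance to all means chosen so far. A standalone sketch of that step on raw double[][] data with Euclidean distance (illustrative, not the ELKI API):

  // Returns the index of the point farthest, by total distance, from the chosen means.
  static int farthestIndex(double[][] data, java.util.List<double[]> chosen) {
    int best = -1;
    double maxdist = Double.NEGATIVE_INFINITY;
    for (int i = 0; i < data.length; i++) {
      double dsum = 0.;
      for (double[] c : chosen) {
        double acc = 0.;
        for (int d = 0; d < c.length; d++) {
          double diff = data[i][d] - c[d];
          acc += diff * diff;
        }
        dsum += Math.sqrt(acc); // Euclidean distance to one chosen mean
      }
      if (dsum > maxdist) {
        maxdist = dsum;
        best = i;
      }
    }
    return best;
  }
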
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
new file mode 100644
index 00000000..aec4fe0f
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
@@ -0,0 +1,346 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * Provides the k-means algorithm, using Lloyd-style bulk iterations.
+ *
+ * However, in contrast to Lloyd's k-means and similar to MacQueen's, the mean
+ * vectors are updated multiple times during each iteration, not only at its end.
+ * This should yield faster convergence at little extra cost.
+ *
+ * To avoid issues with ordered data, we use random sampling to obtain the data
+ * blocks.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansBatchedLloyd.class);
+
+ /**
+ * Number of blocks to use.
+ */
+ int blocks;
+
+ /**
+ * Random used for partitioning.
+ */
+ RandomFactory random;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ * @param blocks Number of blocks
+ * @param random Random factory used for partitioning.
+ */
+ public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer, int blocks, RandomFactory random) {
+ super(distanceFunction, k, maxiter, initializer);
+ this.blocks = blocks;
+ this.random = random;
+ }
+
+ @Override
+ public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+ // Choose initial means
+ List<? extends NumberVector<?>> mvs = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
+ // Convert to (modifiable) math vectors.
+ List<Vector> means = new ArrayList<>(k);
+ for (NumberVector<?> m : mvs) {
+ means.add(m.getColumnVector());
+ }
+
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for (int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+
+ ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), blocks, random);
+
+ double[][] meanshift = new double[k][dim];
+ int[] changesize = new int[k];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ boolean changed = false;
+ FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null;
+ for (int p = 0; p < parts.length; p++) {
+ // Initialize new means scratch space.
+ for (int i = 0; i < k; i++) {
+ Arrays.fill(meanshift[i], 0.);
+ }
+ Arrays.fill(changesize, 0);
+ changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment);
+ // Recompute means.
+ updateMeans(means, meanshift, clusters, changesize);
+ if (pprog != null) {
+ pprog.incrementProcessed(LOG);
+ }
+ }
+ if (pprog != null) {
+ pprog.ensureCompleted(LOG);
+ }
+ // Stop if no cluster assignment changed.
+ if (!changed) {
+ break;
+ }
+ }
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
+
+ // Wrap result
+ final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
+ Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for (int i = 0; i < clusters.size(); i++) {
+ KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Assign each object in the given ID set to its nearest cluster, collecting
+ * the resulting mean shifts and cluster size changes along the way.
+ *
+ * @param relation the database to cluster
+ * @param ids IDs to process
+ * @param oldmeans a list of k means
+ * @param meanshift delta to apply to each mean
+ * @param changesize Cluster size changes
+ * @param clusters Current clusters
+ * @param assignment Current cluster assignment
+ * @return true when any object was reassigned
+ */
+ protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector<?>> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
+ boolean changed = false;
+
+ if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
+ for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for (int i = 0; i < k; i++) {
+ double dist = df.doubleDistance(fv, oldmeans.get(i));
+ if (dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
+ }
+ } else {
+ final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
+ for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ D mindist = df.getDistanceFactory().infiniteDistance();
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for (int i = 0; i < k; i++) {
+ D dist = df.distance(fv, oldmeans.get(i));
+ if (dist.compareTo(mindist) < 0) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
+ }
+ }
+ return changed;
+ }
+
+ /**
+ * Update the assignment of a single object.
+ *
+ * @param id Object to assign
+ * @param fv Vector
+ * @param clusters Clusters
+ * @param assignment Current cluster assignment
+ * @param meanshift Current shifting offset
+ * @param changesize Size change of the current cluster
+ * @param minIndex Index of best cluster.
+ * @return {@code true} when assignment changed.
+ */
+ protected boolean updateAssignment(DBIDIter id, V fv, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[][] meanshift, int[] changesize, int minIndex) {
+ int cur = assignment.intValue(id);
+ if (cur == minIndex) {
+ return false;
+ }
+ // Add to new cluster.
+ {
+ clusters.get(minIndex).add(id);
+ changesize[minIndex]++;
+ double[] raw = meanshift[minIndex];
+ for (int j = 0; j < fv.getDimensionality(); j++) {
+ raw[j] += fv.doubleValue(j);
+ }
+ }
+ // Remove from previous cluster
+ if (cur >= 0) {
+ clusters.get(cur).remove(id);
+ changesize[cur]--;
+ double[] raw = meanshift[cur];
+ for (int j = 0; j < fv.getDimensionality(); j++) {
+ raw[j] -= fv.doubleValue(j);
+ }
+ }
+ assignment.putInt(id, minIndex);
+ return true;
+ }
+
+ /**
+ * Merge changes into mean vectors.
+ *
+ * @param means Mean vectors
+ * @param meanshift Shift offset
+ * @param clusters Current clusters
+ * @param changesize Size of change (for weighting!)
+ */
+ protected void updateMeans(List<Vector> means, double[][] meanshift, List<ModifiableDBIDs> clusters, int[] changesize) {
+ for (int i = 0; i < k; i++) {
+ int newsize = clusters.get(i).size(), oldsize = newsize - changesize[i];
+ if (newsize == 0) {
+ continue; // Keep previous mean vector.
+ }
+ if (oldsize == 0) {
+ means.set(i, new Vector(meanshift[i]).times(1. / newsize));
+ continue; // Replace with new vector.
+ }
+ if (oldsize == newsize) {
+ means.get(i).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize);
+ continue;
+ }
+ means.get(i).timesEquals(oldsize / (double) newsize).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize);
+ }
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ /**
+ * Parameter for the number of blocks.
+ */
+ public static final OptionID BLOCKS_ID = new OptionID("kmeans.blocks", "Number of blocks to use for processing. Means will be recomputed after each block.");
+
+ /**
+ * Random source for blocking.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("kmeans.blocks.random", "Random source for producing blocks.");
+
+ /**
+ * Number of blocks.
+ */
+ int blocks;
+
+ /**
+ * Random used for partitioning.
+ */
+ RandomFactory random;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter blocksP = new IntParameter(BLOCKS_ID, 10);
+ blocksP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if (config.grab(blocksP)) {
+ blocks = blocksP.intValue();
+ }
+ RandomParameter randomP = new RandomParameter(RANDOM_ID);
+ if (config.grab(randomP)) {
+ random = randomP.getValue();
+ }
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected KMeansBatchedLloyd<V, D> makeInstance() {
+ return new KMeansBatchedLloyd<>(distanceFunction, k, maxiter, initializer, blocks, random);
+ }
+ }
+}
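
The merge rule in updateMeans() follows from maintaining sums rather than means: oldsize * oldmean is the coordinate sum of the previous members, and meanshift already holds +x for every point that joined the cluster in this block and -x for every point that left, so the new sum is oldsize * oldmean + meanshift, and dividing by newsize yields the new mean. A compact sketch of the general case (plain arrays, illustrative names; assumes newsize > 0, which the guard above ensures):

  static double[] mergeMean(double[] oldmean, double[] meanshift, int oldsize, int newsize) {
    double[] newmean = new double[oldmean.length];
    for (int d = 0; d < oldmean.length; d++) {
      // size-weighted old mean plus the accumulated shift of moved points
      newmean[d] = (oldsize * oldmean[d] + meanshift[d]) / newsize;
    }
    return newmean;
  }
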
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
index 37071d36..80a581b1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
@@ -41,7 +41,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -205,7 +205,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
super.makeOptions(config);
IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(new GreaterConstraint(1));
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if (config.grab(kP)) {
k = kP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
new file mode 100644
index 00000000..2a60ef27
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
@@ -0,0 +1,155 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Provides the k-means algorithm, alternating between MacQueen-style
+ * incremental processing and Lloyd-style batch steps.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.landmark
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansHybridLloydMacQueen.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ */
+ public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ @Override
+ public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
+ if (relation.size() <= 0) {
+ return new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ List<Vector> means = new ArrayList<>(k);
+ for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) {
+ means.add(nv.getColumnVector());
+ }
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for (int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) {
+ { // MacQueen
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ boolean changed = macQueenIterate(relation, means, clusters, assignment);
+ if (!changed) {
+ break;
+ }
+ }
+ { // Lloyd
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment);
+ // Stop if no cluster assignment changed.
+ if (!changed) {
+ break;
+ }
+ // Recompute means.
+ means = means(clusters, means, relation);
+ }
+ }
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
+
+ // Wrap result
+ final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
+ Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for (int i = 0; i < clusters.size(); i++) {
+ KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected KMeansHybridLloydMacQueen<V, D> makeInstance() {
+ return new KMeansHybridLloydMacQueen<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index e692293c..686e2076 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -93,15 +96,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
if (prog != null) {
prog.incrementProcessed(LOG);
}
- boolean changed = assignToNearestCluster(relation, means, clusters);
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment);
// Stop if no cluster assignment changed.
if (!changed) {
break;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index bb689bd3..a0f4bb3f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -95,11 +98,9 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
// Initialize cluster and assign objects
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
- assignToNearestCluster(relation, means, clusters);
- // Initial recomputation of the means.
- means = means(clusters, means, relation);
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
// Refine result
@@ -107,7 +108,7 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
if (prog != null) {
prog.incrementProcessed(LOG);
}
- boolean changed = macQueenIterate(relation, means, clusters);
+ boolean changed = macQueenIterate(relation, means, clusters, assignment);
if (!changed) {
break;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
index 302ca86b..6fc514eb 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -84,8 +84,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Choose first mean
List<V> means = new ArrayList<>(k);
- Random random = rnd.getRandom();
- DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter());
+ Random random = rnd.getSingleThreadedRandom();
+ DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, random).iter());
means.add(relation.get(first));
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
@@ -134,8 +134,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Choose first mean
ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
- Random random = rnd.getRandom();
- DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, new Random(random.nextLong())).iter());
+ Random random = rnd.getSingleThreadedRandom();
+ DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random).iter());
means.add(first);
ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs());
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
index cc7aaa9e..0a97c4d3 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -88,15 +91,16 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
if (prog != null) {
prog.incrementProcessed(LOG);
}
- boolean changed = assignToNearestCluster(relation, medians, clusters);
+ boolean changed = assignToNearestCluster(relation, medians, clusters, assignment);
// Stop if no cluster assignment changed.
if (!changed) {
break;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
index 87a0c7ae..41cca225 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.Mean;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -119,7 +118,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
* @return result
*/
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
@@ -127,7 +126,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
Mean[] mdists = Mean.newArray(k);
@@ -139,47 +138,47 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null;
// Swap phase
boolean changed = true;
- while (changed) {
- if (prog != null) {
+ while(changed) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
changed = false;
// Try to swap the medoid with a better cluster member:
int i = 0;
- for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
DBID best = null;
Mean bestm = mdists[i];
- for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
- if (DBIDUtil.equal(miter, iter)) {
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(DBIDUtil.equal(miter, iter)) {
continue;
}
Mean mdist = new Mean();
- for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
mdist.put(distQ.distance(iter, iter2).doubleValue());
}
- if (mdist.getMean() < bestm.getMean()) {
+ if(mdist.getMean() < bestm.getMean()) {
best = DBIDUtil.deref(iter);
bestm = mdist;
}
}
- if (best != null && !DBIDUtil.equal(miter, best)) {
+ if(best != null && !DBIDUtil.equal(miter, best)) {
changed = true;
medoids.set(i, best);
mdists[i] = bestm;
}
}
// Reassign
- if (changed) {
+ if(changed) {
assignToNearestCluster(medoids, mdists, clusters, distQ);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.setCompleted(LOG);
}
// Wrap result
Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < clusters.size(); i++) {
MedoidModel model = new MedoidModel(medoids.get(i));
result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
@@ -200,27 +199,27 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
boolean changed = false;
double[] dists = new double[k];
- for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
int minIndex = 0;
double mindist = Double.POSITIVE_INFINITY;
{
int i = 0;
- for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
dists[i] = distQ.distance(iditer, miter).doubleValue();
- if (dists[i] < mindist) {
+ if(dists[i] < mindist) {
minIndex = i;
mindist = dists[i];
}
}
}
- if (clusters.get(minIndex).add(iditer)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
mdist[minIndex].put(mindist);
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
mdist[i].put(dists[i], -1);
break;
}
@@ -259,19 +258,19 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.intValue();
}
}
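
The swap phase above implements an EM-style medoid update: within each cluster, the member with the smallest mean distance to all other members becomes the new medoid. A hedged sketch of that criterion on a precomputed distance matrix (illustrative names, not the ELKI API):

  // members lists the indices of one cluster; dist is a symmetric distance matrix.
  static int bestMedoid(double[][] dist, int[] members) {
    int best = -1;
    double bestMean = Double.POSITIVE_INFINITY;
    for (int a : members) {
      double sum = 0.;
      for (int b : members) {
        sum += dist[a][b]; // self-distance is 0, matching the loop above
      }
      double mean = sum / members.length;
      if (mean < bestMean) {
        bestMean = mean;
        best = a;
      }
    }
    return best;
  }
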
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
index 1feda867..c9e1dc47 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -53,8 +53,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -124,7 +123,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
* @return result
*/
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
@@ -133,7 +132,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
@@ -145,8 +144,8 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
// Swap phase
boolean changed = true;
- while (changed) {
- if (prog != null) {
+ while(changed) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
changed = false;
@@ -155,57 +154,60 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
DBID bestid = null;
int bestcluster = -1;
int i = 0;
- for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
- for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
- if (DBIDUtil.equal(miter, iter)) {
+ for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(DBIDUtil.equal(miter, iter)) {
continue;
}
// double disti = distQ.distance(id, med).doubleValue();
double cost = 0;
DBIDIter olditer = medoids.iter();
- for (int j = 0; j < k; j++, olditer.advance()) {
- for (DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
+ for(int j = 0; j < k; j++, olditer.advance()) {
+ for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
double distcur = distQ.distance(iter2, olditer).doubleValue();
double distnew = distQ.distance(iter2, iter).doubleValue();
- if (j == i) {
+ if(j == i) {
// Cases 1 and 2.
double distsec = second.doubleValue(iter2);
- if (distcur > distsec) {
+ if(distsec < distnew) {
// Case 1, other would switch to a third medoid
cost += distsec - distcur; // Always positive!
- } else { // Would remain with the candidate
+ }
+ else { // Would remain with the candidate
cost += distnew - distcur; // Could be negative
}
- } else {
+ }
+ else {
// Cases 3-4: objects from other clusters
- if (distcur < distnew) {
+ if(distcur < distnew) {
// Case 3: no change
- } else {
+ }
+ else {
// Case 4: would switch to new medoid
cost += distnew - distcur; // Always negative
}
}
}
}
- if (cost < best) {
+ if(cost < best) {
best = cost;
bestid = DBIDUtil.deref(iter);
bestcluster = i;
}
}
}
- if (prog != null) {
+ if(prog != null) {
prog.setCompleted(LOG);
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Best cost: " + best);
}
- if (bestid != null) {
+ if(bestid != null) {
changed = true;
medoids.set(bestcluster, bestid);
}
// Reassign
- if (changed) {
+ if(changed) {
// TODO: can we save some of these recomputations?
assignToNearestCluster(medoids, ids, second, clusters, distQ);
}
@@ -213,7 +215,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
// Wrap result
Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < clusters.size(); i++) {
MedoidModel model = new MedoidModel(medoids.get(i));
result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
@@ -234,30 +236,31 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
boolean changed = false;
- for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
int minIndex = 0;
double mindist = Double.POSITIVE_INFINITY;
double mindist2 = Double.POSITIVE_INFINITY;
{
int i = 0;
- for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
double dist = distQ.distance(iditer, miter).doubleValue();
- if (dist < mindist) {
+ if(dist < mindist) {
minIndex = i;
mindist2 = mindist;
mindist = dist;
- } else if (dist < mindist2) {
+ }
+ else if(dist < mindist2) {
mindist2 = dist;
}
}
}
- if (clusters.get(minIndex).add(iditer)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
break;
}
}
@@ -296,19 +299,19 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.intValue();
}
}
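
For reference, the four cases enumerated in the swap loop correspond to the standard PAM decomposition of a swap's cost. Per non-medoid object o, with distCur its distance to its current medoid, distNew its distance to the swap candidate, and distSec its distance to the second-nearest medoid, the contribution can be written compactly as below (a hedged sketch with illustrative names, mirroring distcur, distnew and distsec above):

  static double swapCost(boolean memberOfSwappedCluster, double distCur, double distNew, double distSec) {
    if (memberOfSwappedCluster) {
      // Cases 1 and 2: o loses its medoid and goes to the candidate or to
      // its second-nearest old medoid, whichever is closer.
      return Math.min(distNew, distSec) - distCur;
    }
    // Cases 3 and 4: o keeps its medoid unless the candidate is closer.
    return Math.min(distNew, distCur) - distCur;
  }

Summing this over all objects gives the cost of one candidate swap; the loop above accepts the swap with the most negative total.
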
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
index ee90e0dc..1329132e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -60,7 +60,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
List<V> means = new ArrayList<>(k);
- final Random random = rnd.getRandom();
+ final Random random = rnd.getSingleThreadedRandom();
for(int i = 0; i < k; i++) {
double[] r = MathUtil.randomDoubleArray(dim, random);
// Rescale
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
index 9f0a1923..79013364 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
@@ -93,7 +93,7 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv);
List<V> means = new ArrayList<>();
for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) {
- means.add((V) cluster.getModel().getMean());
+ means.add(cluster.getModel().getMean());
}
return means;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
index ed9a528d..1be19bd1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
@@ -1,4 +1,27 @@
/**
* Quality measures for k-Means results.
*/
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java
new file mode 100644
index 00000000..55114f7d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java
@@ -0,0 +1,384 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction;
+import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Cluster one-dimensional data by splitting the data set on local minima after
+ * performing kernel density estimation.
+ *
+ * @author Erich Schubert
+ */
+public class KNNKernelDensityMinimaClustering<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<ClusterModel>> implements ClusteringAlgorithm<Clustering<ClusterModel>> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(KNNKernelDensityMinimaClustering.class);
+
+ /**
+ * Estimation mode.
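+ * <p>
+ * In standard kernel density estimation terminology, the balloon estimator
+ * scales the kernel by the bandwidth at the query point (here, its kNN
+ * distance), while the sample-point estimator uses the bandwidth of each
+ * contributing data point.
+ * </p>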
+ *
+ * @apiviz.exclude
+ */
+ public static enum Mode {
+ BALLOON, // Balloon estimator
+ SAMPLE, // Sample-point estimator
+ }
+
+ /**
+ * Dimension to use for clustering.
+ */
+ protected int dim;
+
+ /**
+ * Kernel density function.
+ */
+ protected KernelDensityFunction kernel;
+
+ /**
+ * Estimation modes.
+ */
+ protected Mode mode;
+
+ /**
+ * Number of neighbors to use for bandwidth.
+ */
+ protected int k;
+
+ /**
+ * Window width, for the local minimum criterion.
+ */
+ protected int minwindow;
+
+ /**
+ * Constructor.
+ *
+ * @param dim Dimension to use for clustering
+ * @param kernel Kernel function
+ * @param mode Bandwidth mode
+ * @param k Number of neighbors
+ * @param minwindow Window size for comparison
+ */
+ public KNNKernelDensityMinimaClustering(int dim, KernelDensityFunction kernel, Mode mode, int k, int minwindow) {
+ super();
+ this.dim = dim;
+ this.kernel = kernel;
+ this.mode = mode;
+ this.k = k;
+ this.minwindow = minwindow;
+ }
+
+ /**
+ * Run the clustering algorithm on a data relation.
+ *
+ * @param relation Relation
+ * @return Clustering result
+ */
+ public Clustering<ClusterModel> run(Relation<V> relation) {
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ final int size = ids.size();
+
+ // Sort by the sole dimension
+ ids.sort(new VectorUtil.SortDBIDsBySingleDimension(relation, dim));
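+ // In one dimension, the k nearest neighbors of a point lie among its k
+ // predecessors and k successors in sorted order, so bandwidths can be
+ // found with a linear scan plus quickselect on the gap array below.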
+
+ // Density storage.
+ WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
+ DBIDArrayIter iter = ids.iter(), iter2 = ids.iter();
+
+ StepProgress sprog = LOG.isVerbose() ? new StepProgress("Clustering steps", 2) : null;
+
+ if(sprog != null) {
+ sprog.beginStep(1, "Kernel density estimation.", LOG);
+ }
+ {
+ double[] scratch = new double[2 * k];
+ iter.seek(0);
+ for(int i = 0; i < size; i++, iter.advance()) {
+ // Current value.
+ final double curv = relation.get(iter).doubleValue(dim);
+
+ final int pre = Math.max(i - k, 0), prek = i - pre;
+ final int pos = Math.min(i + k, size - 1), posk = pos - i;
+ iter2.seek(pre);
+ for(int j = 0; j < prek; j++, iter2.advance()) {
+ scratch[j] = curv - relation.get(iter2).doubleValue(dim);
+ }
+ assert (iter2.getOffset() == i);
+ iter2.advance();
+ for(int j = 0; j < posk; j++, iter2.advance()) {
+ scratch[prek + j] = relation.get(iter2).doubleValue(dim) - curv;
+ }
+
+ assert (prek + posk >= k);
+ double kdist = QuickSelect.quickSelect(scratch, 0, prek + posk, k);
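+ // kdist is the distance to the k-nearest neighbor, used as adaptive
+ // bandwidth h; each neighbor contributes K(gap / h) to the estimate.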
+ switch(mode){
+ case BALLOON: {
+ double dens = 0.;
+ if(kdist > 0.) {
+ for(int j = 0; j < prek + posk; j++) {
+ dens += kernel.density(scratch[j] / kdist);
+ }
+ }
+ else {
+ dens = Double.POSITIVE_INFINITY;
+ }
+ assert (iter.getOffset() == i);
+ density.putDouble(iter, dens);
+ break;
+ }
+ case SAMPLE: {
+ if(kdist > 0.) {
+ iter2.seek(pre);
+ for(int j = 0; j < prek; j++, iter2.advance()) {
+ double delta = curv - relation.get(iter2).doubleValue(dim);
+ density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
+ }
+ assert (iter2.getOffset() == i);
+ iter2.advance();
+ for(int j = 0; j < posk; j++, iter2.advance()) {
+ double delta = relation.get(iter2).doubleValue(dim) - curv;
+ density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
+ }
+ }
+ else {
+ iter2.seek(pre);
+ for(int j = 0; j < prek; j++, iter2.advance()) {
+ double delta = curv - relation.get(iter2).doubleValue(dim);
+ if(!(delta > 0.)) {
+ density.putDouble(iter2, Double.POSITIVE_INFINITY);
+ }
+ }
+ assert (iter2.getOffset() == i);
+ iter2.advance();
+ for(int j = 0; j < posk; j++, iter2.advance()) {
+ double delta = relation.get(iter2).doubleValue(dim) - curv;
+ if(!(delta > 0.)) {
+ density.putDouble(iter2, Double.POSITIVE_INFINITY);
+ }
+ }
+ }
+ break;
+ }
+ default:
+ throw new UnsupportedOperationException("Unknown mode specified.");
+ }
+ }
+ }
+
+ if(sprog != null) {
+ sprog.beginStep(2, "Local minima detection.", LOG);
+ }
+ Clustering<ClusterModel> clustering = new Clustering<>("onedimensional-kde-clustering", "One-Dimensional clustering using kernel density estimation.");
+ {
+ double[] scratch = new double[2 * minwindow + 1];
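+ // Ring buffer of the last 2 * minwindow + 1 density values; index t below
+ // is the candidate local minimum, trailing minwindow + 1 positions behind i.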
+ int begin = 0;
+ int halfw = (minwindow + 1) >> 1;
+ iter.seek(0);
+ // Sliding window scan; the first iterations only fill the ring buffer.
+ for(int i = 0; i < size; i++, iter.advance()) {
+ final int m = i % scratch.length, t = (i - minwindow - 1) % scratch.length;
+ scratch[m] = density.doubleValue(iter);
+ if(i > scratch.length) {
+ double min = Double.POSITIVE_INFINITY;
+ for(int j = 0; j < scratch.length; j++) {
+ if(j != t && scratch[j] < min) {
+ min = scratch[j];
+ }
+ }
+ // Local minimum:
+ if(scratch[t] < min) {
+ int end = i - minwindow + 1;
+ { // Test on which side the kNN is
+ iter2.seek(end);
+ double curv = relation.get(iter2).doubleValue(dim);
+ iter2.seek(end - halfw);
+ double left = relation.get(iter2).doubleValue(dim) - curv;
+ iter2.seek(end + halfw);
+ double right = curv - relation.get(iter2).doubleValue(dim);
+ if(left < right) {
+ end++;
+ }
+ }
+ iter2.seek(begin);
+ ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
+ for(int j = 0; j < end - begin; j++, iter2.advance()) {
+ cids.add(iter2);
+ }
+ clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
+ begin = end;
+ }
+ }
+ }
+ // Extract last cluster
+ int end = size;
+ iter2.seek(begin);
+ ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
+ for(int j = 0; j < end - begin; j++, iter2.advance()) {
+ cids.add(iter2);
+ }
+ clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
+ }
+
+ if(sprog != null) {
+ sprog.setCompleted(LOG);
+ }
+ return clustering;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(new VectorFieldTypeInformation<>(NumberVector.class, dim + 1, Integer.MAX_VALUE));
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Dimension to use for clustering.
+ */
+ public static final OptionID DIM_ID = new OptionID("kernelcluster.dim", "Dimension to use for clustering. For one-dimensional data, use 0.");
+
+ /**
+ * Kernel function.
+ */
+ public static final OptionID KERNEL_ID = new OptionID("kernelcluster.kernel", "Kernel function for density estimation.");
+
+ /**
+ * KDE mode.
+ */
+ public static final OptionID MODE_ID = new OptionID("kernelcluster.mode", "Kernel density estimation mode (balloon estimator vs. sample-point estimator).");
+
+ /**
+ * Number of neighbors for bandwidth estimation.
+ */
+ public static final OptionID K_ID = new OptionID("kernelcluster.knn", "Number of nearest neighbors to use for bandwidth estimation.");
+
+ /**
+ * Half window width to find local minima.
+ */
+ public static final OptionID WINDOW_ID = new OptionID("kernelcluster.window", "Half width of sliding window to find local minima.");
+
+ /**
+ * Dimension to use for clustering.
+ */
+ protected int dim;
+
+ /**
+ * Kernel density function.
+ */
+ protected KernelDensityFunction kernel;
+
+ /**
+ * Estimation modes.
+ */
+ protected Mode mode;
+
+ /**
+ * Number of neighbors to use for bandwidth.
+ */
+ protected int k;
+
+ /**
+ * Window width, for the local minimum criterion.
+ */
+ protected int minwindow;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter dimP = new IntParameter(DIM_ID, 0);
+ dimP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(dimP)) {
+ dim = dimP.intValue();
+ }
+
+ ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class);
+ if(config.grab(kernelP)) {
+ kernel = kernelP.instantiateClass(config);
+ }
+
+ EnumParameter<Mode> modeP = new EnumParameter<>(MODE_ID, Mode.class, Mode.BALLOON);
+ if(config.grab(modeP)) {
+ mode = modeP.getValue();
+ }
+
+ IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.intValue();
+ }
+
+ IntParameter windowP = new IntParameter(WINDOW_ID);
+ windowP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(windowP)) {
+ minwindow = windowP.intValue();
+ }
+ }
+
+ @Override
+ protected KNNKernelDensityMinimaClustering<V> makeInstance() {
+ return new KNNKernelDensityMinimaClustering<>(dim, kernel, mode, k, minwindow);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java
new file mode 100644
index 00000000..c6c55244
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Clustering algorithms for one-dimensional data.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional;
\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
index db026e93..617d74cd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
@@ -56,8 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
@@ -594,14 +593,14 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter xsiP = new IntParameter(XSI_ID);
- xsiP.addConstraint(new GreaterConstraint(0));
+ xsiP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(xsiP)) {
xsi = xsiP.intValue();
}
DoubleParameter tauP = new DoubleParameter(TAU_ID);
- tauP.addConstraint(new GreaterConstraint(0));
- tauP.addConstraint(new LessConstraint(1));
+ tauP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ tauP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
if(config.grab(tauP)) {
tau = tauP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
new file mode 100644
index 00000000..5f798a66
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
@@ -0,0 +1,605 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceMaximumDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * <p>
+ * Provides the DOC algorithm, and its heuristic variant, FastDOC. DOC is a
+ * sampling-based subspace clustering algorithm.
+ * </p>
+ *
+ * <p>
+ * Reference: <br/>
+ * C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali<br />
+ * A Monte Carlo algorithm for fast projective clustering. <br/>
+ * In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02).
+ * </p>
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.has SubspaceModel
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("DOC: Density-based Optimal projective Clustering")
+@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", title = "A Monte Carlo algorithm for fast projective clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", url = "http://dx.doi.org/10.1145/564691.564739")
+public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(DOC.class);
+
+ /**
+ * Relative density threshold parameter alpha.
+ */
+ private double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ private double beta;
+
+ /**
+ * Half width parameter.
+ */
+ private double w;
+
+ /**
+ * Holds the value of {@link Parameterizer#HEURISTICS_ID}.
+ */
+ private boolean heuristics;
+
+ /**
+ * Holds the value of {@link Parameterizer#D_ZERO_ID}.
+ */
+ private int d_zero;
+
+ /**
+ * Randomizer used internally for sampling points.
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha &alpha; relative density threshold.
+ * @param beta &beta; balancing parameter for size vs. dimensionality.
+ * @param w <em>w</em> half width parameter.
+ * @param heuristics whether to use heuristics (FastDOC) or not.
+ * @param d_zero stopping threshold for FastDOC: the number of relevant attributes at which to stop iterating.
+ * @param random Random factory
+ */
+ public DOC(double alpha, double beta, double w, boolean heuristics, int d_zero, RandomFactory random) {
+ this.alpha = alpha;
+ this.beta = beta;
+ this.w = w;
+ this.heuristics = heuristics;
+ this.d_zero = d_zero;
+ this.rnd = random;
+ }
+
+ /**
+ * Performs the DOC or FastDOC (as configured) algorithm on the given
+ * Database.
+ *
+ * <p>
+ * This will run exhaustively, i.e. it repeats DOC until no further clusters
+ * are found, or the remaining data set has shrunk below the minimum cluster
+ * size.
+ * </p>
+ *
+ * @param database Database
+ * @param relation Data relation
+ * @return Clustering result
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ // Dimensionality of our set.
+ final int d = RelationUtil.dimensionality(relation);
+
+ // Get available DBIDs as a set we can remove items from.
+ ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());
+
+ // Precompute values as described in Figure 2.
+ double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
+ // Outer loop count.
+ int n = (int) (2. / alpha);
+ // Inner loop count.
+ int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
+ if(heuristics) {
+ m = Math.min(m, Math.min(1000000, d * d));
+ }
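+ // Rough example with the defaults alpha = 0.2, beta = 0.8 and d = 10:
+ // n = 10, r = |log(20) / log(0.4)| = 3.27 and m is roughly 2580; with
+ // heuristics enabled, m is then capped at d * d = 100.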
+
+ // Minimum size for a cluster for it to be accepted.
+ int minClusterSize = (int) (alpha * S.size());
+
+ // List of all clusters we found.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("DOC Clusters", "DOC");
+
+ // Inform the user about the number of actual clusters found so far.
+ IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
+
+ // To not only find a single cluster, we continue until the remaining set
+ // of points becomes too small to contain another accepted cluster.
+ while(S.size() > minClusterSize) {
+ Cluster<SubspaceModel<V>> C;
+ if(heuristics) {
+ C = runFastDOC(relation, S, d, n, m, (int) r);
+ }
+ else {
+ C = runDOC(relation, S, d, n, m, (int) r, minClusterSize);
+ }
+
+ if(C == null) {
+ // Stop trying if we couldn't find a cluster.
+ break;
+ }
+ // Found a cluster, remember it, remove its points from the set.
+ result.addToplevelCluster(C);
+
+ // Remove all points of the cluster from the set and continue.
+ S.removeDBIDs(C.getIDs());
+
+ if(cprogress != null) {
+ cprogress.setProcessed(result.getAllClusters().size(), LOG);
+ }
+ }
+
+ // Add the remainder as noise.
+ if(S.size() > 0) {
+ BitSet alldims = new BitSet();
+ alldims.set(0, d);
+ result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel<>(new Subspace(alldims), Centroid.make(relation, S).toVector(relation))));
+ }
+
+ if(cprogress != null) {
+ cprogress.setCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Performs a single run of DOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @param minClusterSize Minimum size a cluster must have to be accepted.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
+ final DoubleDistance wd = new DoubleDistance(w);
+ // Best cluster for the current run.
+ DBIDs C = null;
+ // Relevant attributes for the best cluster.
+ BitSet D = null;
+ // Quality of the best cluster.
+ double quality = Double.NEGATIVE_INFINITY;
+
+ // Bounds for our cluster.
+ // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new
+ // double[d], new double[d]);
+
+ // Weights for distance (= rectangle query)
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(new BitSet(d));
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq);
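+ // A maximum-norm range query of radius w over the selected dimensions
+ // matches the axis-parallel box of side 2 * w from the DOC cluster
+ // definition.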
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+ DBIDArrayIter iter = S.iter();
+
+ for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension and build bounding box.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+ if(nD.cardinality() > 0) {
+ // Get all points in the box.
+ df.setSelectedDimensions(nD);
+ // TODO: add filtering capabilities into query API!
+ DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, wd));
+
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + nD.cardinality());
+ }
+
+ // Is the cluster large enough?
+ if(nC.size() < minClusterSize) {
+ // Too small.
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but it's too small.");
+ }
+ }
+ else {
+ // Better cluster than before?
+ double nQuality = computeClusterQuality(nC.size(), nD.cardinality());
+ if(nQuality > quality) {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
+ }
+ C = nC;
+ D = nD;
+ quality = nQuality;
+ }
+ else {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but we already have a better one.");
+ }
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ if(C != null) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Performs a single run of FastDOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
+ // Relevant attributes of highest cardinality.
+ BitSet D = null;
+ // The seed point for the best dimensions.
+ DBIDVar dV = DBIDUtil.newVar();
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+
+ DBIDArrayIter iter = S.iter();
+ outer: for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+
+ if(D == null || nD.cardinality() > D.cardinality()) {
+ D = nD;
+ dV.set(iter);
+
+ if(D.cardinality() >= d_zero) {
+ if(iprogress != null) {
+ iprogress.setProcessed(iprogress.getTotal(), LOG);
+ }
+ break outer;
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ // If no relevant dimensions were found, skip it.
+ if(D == null || D.cardinality() == 0) {
+ return null;
+ }
+
+ // Get all points in the box.
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
+
+ // TODO: add filtering capabilities into query API!
+ DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, new DoubleDistance(w)));
+
+ // If we have a non-empty cluster, return it.
+ if(C.size() > 0) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Utility method to test if a given dimension is relevant as determined via a
+ * set of reference points (i.e. if the spread, max - min, of the reference
+ * points along the attribute is at most the half width w).
+ *
+ * @param dimension the dimension to test.
+ * @param relation used to get actual values for DBIDs.
+ * @param points the points to test.
+ * @return <code>true</code> if the dimension is relevant.
+ */
+ private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
+ double min = Double.POSITIVE_INFINITY;
+ double max = Double.NEGATIVE_INFINITY;
+ for(DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
+ V xV = relation.get(iter);
+ min = Math.min(min, xV.doubleValue(dimension));
+ max = Math.max(max, xV.doubleValue(dimension));
+ if(max - min > w) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Utility method to create a subspace cluster from a list of DBIDs and the
+ * relevant attributes.
+ *
+ * @param relation to compute a centroid.
+ * @param C the cluster points.
+ * @param D the relevant dimensions.
+ * @return an object representing the subspace cluster.
+ */
+ private Cluster<SubspaceModel<V>> makeCluster(Relation<V> relation, DBIDs C, BitSet D) {
+ DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
+ Cluster<SubspaceModel<V>> cluster = new Cluster<>(ids);
+ cluster.setModel(new SubspaceModel<>(new Subspace(D), Centroid.make(relation, ids).toVector(relation)));
+ return cluster;
+ }
+
+ /**
+ * Computes the quality of a cluster based on its size and number of relevant
+ * attributes, as described via the &mu;-function from the paper.
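+ * <p>
+ * That is, &mu;(|C|, |D|) = |C| &middot; (1/&beta;)^|D|. For example, with
+ * the default &beta; = 0.8, a cluster of 100 points with 2 relevant
+ * dimensions scores 100 &middot; 1.25&sup2; = 156.25.
+ * </p>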
+ *
+ * @param clusterSize the size of the cluster.
+ * @param numRelevantDimensions the number of dimensions relevant to the
+ * cluster.
+ * @return a quality measure (only use this to compare the quality to that of
+ * other clusters).
+ */
+ private double computeClusterQuality(int clusterSize, int numRelevantDimensions) {
+ return clusterSize * Math.pow(1. / beta, numRelevantDimensions);
+ }
+
+ // ---------------------------------------------------------------------- //
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("doc.alpha", "Minimum relative density for a set of points to be considered a cluster (|C|>=doc.alpha*|S|).");
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ public static final OptionID BETA_ID = new OptionID("doc.beta", "Preference of cluster size versus number of relevant dimensions (higher value means higher priority on larger clusters).");
+
+ /**
+ * Half width parameter.
+ */
+ public static final OptionID W_ID = new OptionID("doc.w", "Maximum extent of scattering of points along a single attribute for the attribute to be considered relevant.");
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ public static final OptionID HEURISTICS_ID = new OptionID("doc.fastdoc", "Use heuristics as described, thus using the FastDOC algorithm.");
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ public static final OptionID D_ZERO_ID = new OptionID("doc.d0", "Parameter for FastDOC, setting the number of relevant attributes which, when found for a cluster, are deemed enough to stop iterating.");
+
+ /**
+ * Random seeding parameter.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("doc.random-seed", "Random seed, for reproducible experiments.");
+
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ protected double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ protected double beta;
+
+ /**
+ * Half width parameter.
+ */
+ protected double w;
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ protected boolean heuristics;
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ protected int d_zero;
+
+ /**
+ * Random seeding factory.
+ */
+ protected RandomFactory random = RandomFactory.DEFAULT;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(BETA_ID, 0.8);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(param)) {
+ beta = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(W_ID, 0.05);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ w = param.getValue();
+ }
+ }
+
+ {
+ Flag param = new Flag(HEURISTICS_ID);
+ if(config.grab(param)) {
+ heuristics = param.getValue();
+ }
+ }
+
+ if(heuristics) {
+ IntParameter param = new IntParameter(D_ZERO_ID, 5);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ d_zero = param.getValue();
+ }
+ }
+
+ {
+ RandomParameter param = new RandomParameter(RANDOM_ID);
+ if(config.grab(param)) {
+ random = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected DOC<V> makeInstance() {
+ return new DOC<>(alpha, beta, w, heuristics, d_zero, random);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
index b17ebebb..cd5e51b8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
@@ -69,8 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -170,12 +169,12 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
*/
public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
// Instantiate DiSH distance (and thus run the preprocessor)
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("*** Run DiSH preprocessor.");
}
DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation);
// Configure and run OPTICS.
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("*** Run OPTICS algorithm.");
}
ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters);
@@ -186,7 +185,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
optics = opticsconfig.tryInstantiate(cls);
ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("*** Compute Clusters.");
}
return computeClusters(relation, opticsResult, dishDistanceQuery);
@@ -206,10 +205,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// extract clusters
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 1: extract clusters");
- for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
+ for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
}
}
@@ -218,10 +217,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// check if there are clusters < minpts
checkClusters(database, distFunc, clustersMap, minpts);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 2: check clusters");
- for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
+ for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
}
}
@@ -230,9 +229,9 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// sort the clusters
List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 3: sort clusters");
- for (Cluster<SubspaceModel<V>> c : clusters) {
+ for(Cluster<SubspaceModel<V>> c : clusters) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size());
}
LOG.verbose(msg.toString());
@@ -241,14 +240,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// build the hierarchy
Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering");
buildHierarchy(database, distFunc, clustering, clusters, dimensionality);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 4: build hierarchy");
- for (Cluster<SubspaceModel<V>> c : clusters) {
+ for(Cluster<SubspaceModel<V>> c : clusters) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size());
- for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
msg.append("\n parent ").append(iter.get());
}
- for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
msg.append("\n child ").append(iter.get());
}
}
@@ -256,8 +255,8 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
// build result
- for (Cluster<SubspaceModel<V>> c : clusters) {
- if (clustering.getClusterHierarchy().numParents(c) == 0) {
+ for(Cluster<SubspaceModel<V>> c : clusters) {
+ if(clustering.getClusterHierarchy().numParents(c) == 0) {
clustering.addToplevelCluster(c);
}
}
@@ -278,7 +277,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>();
Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>();
Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>();
- for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
+ for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next();
entryMap.put(entry.getID(), entry);
@@ -287,43 +286,43 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// get the list of (parallel) clusters for the preference vector
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector);
- if (parallelClusters == null) {
+ if(parallelClusters == null) {
parallelClusters = new ArrayList<>();
clustersMap.put(preferenceVector, parallelClusters);
}
// look for the proper cluster
Pair<BitSet, ArrayModifiableDBIDs> cluster = null;
- for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database);
PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector);
- if (dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
+ if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector());
- if (d <= 2 * epsilon) {
+ if(d <= 2 * epsilon) {
cluster = c;
break;
}
}
}
- if (cluster == null) {
+ if(cluster == null) {
cluster = new Pair<>(preferenceVector, DBIDUtil.newArray());
parallelClusters.add(cluster);
}
cluster.second.add(entry.getID());
entryToClusterMap.put(entry.getID(), cluster);
- if (progress != null) {
+ if(progress != null) {
progress.setProcessed(++processed, LOG);
}
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
- if (LOG.isDebuggingFiner()) {
+ if(LOG.isDebuggingFiner()) {
StringBuilder msg = new StringBuilder("Step 0");
- for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
+ for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size());
}
}
@@ -331,24 +330,24 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
// add the predecessor to the cluster
- for (BitSet pv : clustersMap.keySet()) {
+ for(BitSet pv : clustersMap.keySet()) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for (Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
- if (cluster.second.isEmpty()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
+ if(cluster.second.isEmpty()) {
continue;
}
DBID firstID = cluster.second.get(0);
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID);
DBID predecessorID = entry.getPredecessorID();
- if (predecessorID == null) {
+ if(predecessorID == null) {
continue;
}
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID);
// parallel cluster
- if (predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
+ if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
continue;
}
- if (predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
+ if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
continue;
}
@@ -375,16 +374,17 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
final int db_dim = RelationUtil.dimensionality(database);
// int num = 1;
List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>();
- for (BitSet pv : clustersMap.keySet()) {
+ for(BitSet pv : clustersMap.keySet()) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for (int i = 0; i < parallelClusters.size(); i++) {
+ for(int i = 0; i < parallelClusters.size(); i++) {
Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i);
Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second);
cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database)));
String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, "");
- if (parallelClusters.size() > 1) {
+ if(parallelClusters.size() > 1) {
cluster.setName("Cluster_" + subspace + "_" + i);
- } else {
+ }
+ else {
cluster.setName("Cluster_" + subspace);
}
clusters.add(cluster);
@@ -417,11 +417,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>();
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>();
Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray());
- for (BitSet pv : clustersMap.keySet()) {
+ for(BitSet pv : clustersMap.keySet()) {
// noise
- if (pv.cardinality() == 0) {
+ if(pv.cardinality() == 0) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
noise.second.addDBIDs(c.second);
}
}
@@ -429,10 +429,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
else {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size());
- for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
- if (!pv.equals(new BitSet()) && c.second.size() < minpts) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
+ if(!pv.equals(new BitSet()) && c.second.size() < minpts) {
notAssigned.add(c);
- } else {
+ }
+ else {
newParallelClusters.add(c);
}
}
@@ -443,14 +444,15 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
clustersMap.clear();
clustersMap.putAll(newClustersMap);
- for (Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
- if (c.second.isEmpty()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
+ if(c.second.isEmpty()) {
continue;
}
Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap);
- if (parent != null) {
+ if(parent != null) {
parent.second.addDBIDs(c.second);
- } else {
+ }
+ else {
noise.second.addDBIDs(c.second);
}
}
@@ -477,23 +479,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
BitSet childPV = child.first;
int childCardinality = childPV.cardinality();
- for (BitSet parentPV : clustersMap.keySet()) {
+ for(BitSet parentPV : clustersMap.keySet()) {
int parentCardinality = parentPV.cardinality();
- if (parentCardinality >= childCardinality) {
+ if(parentCardinality >= childCardinality) {
continue;
}
- if (resultCardinality != -1 && parentCardinality <= resultCardinality) {
+ if(resultCardinality != -1 && parentCardinality <= resultCardinality) {
continue;
}
BitSet pv = (BitSet) childPV.clone();
pv.and(parentPV);
- if (pv.equals(parentPV)) {
+ if(pv.equals(parentPV)) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV);
- for (Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database);
double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV);
- if (d <= 2 * epsilon) {
+ if(d <= 2 * epsilon) {
result = parent;
resultCardinality = parentCardinality;
break;
@@ -519,57 +521,59 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
final int db_dim = RelationUtil.dimensionality(database);
Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy();
- for (int i = 0; i < clusters.size() - 1; i++) {
+ for(int i = 0; i < clusters.size() - 1; i++) {
Cluster<SubspaceModel<V>> c_i = clusters.get(i);
int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality();
V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database);
- for (int j = i + 1; j < clusters.size(); j++) {
+ for(int j = i + 1; j < clusters.size(); j++) {
Cluster<SubspaceModel<V>> c_j = clusters.get(j);
int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality();
- if (subspaceDim_i < subspaceDim_j) {
- if (LOG.isDebugging()) {
+ if(subspaceDim_i < subspaceDim_j) {
+ if(LOG.isDebugging()) {
msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']');
msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']');
}
// noise level reached
- if (c_j.getModel().getSubspace().dimensionality() == 0) {
+ if(c_j.getModel().getSubspace().dimensionality() == 0) {
// no parents exists -> parent is noise
- if (hier.numParents(c_i) == 0) {
+ if(hier.numParents(c_i) == 0) {
clustering.addChildCluster(c_j, c_i);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
msg.append(']');
}
}
- } else {
+ }
+ else {
V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database);
PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions());
double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector());
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
msg.append("\n dist = ").append(distance.getCorrelationValue());
}
- if (distance.getCorrelationValue() == subspaceDim_j) {
- if (LOG.isDebugging()) {
+ if(distance.getCorrelationValue() == subspaceDim_j) {
+ if(LOG.isDebugging()) {
msg.append("\n d = ").append(d);
}
- if (d <= 2 * epsilon) {
+ if(d <= 2 * epsilon) {
// no parent exists or c_j is not a parent of the already
// existing parents
- if (hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
+ if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
clustering.addChildCluster(c_j, c_i);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
msg.append("] is parent of [");
msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
msg.append(']');
}
}
- } else {
+ }
+ else {
throw new RuntimeException("Should never happen: d = " + d);
}
}
@@ -577,7 +581,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug(msg.toString());
}
}
@@ -599,11 +603,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
int dimensionality = RelationUtil.dimensionality(database);
int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality();
- for (; iter.valid(); iter.advance()) {
+ for(; iter.valid(); iter.advance()) {
Cluster<SubspaceModel<V>> child = iter.get();
V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database);
PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions());
- if (distance.getCorrelationValue() == subspaceDim_parent) {
+ if(distance.getCorrelationValue() == subspaceDim_parent) {
return true;
}
}
@@ -642,14 +646,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
super.makeOptions(config);
DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, 0.001);
- epsilonP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(epsilonP)) {
+ epsilonP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(epsilonP)) {
epsilon = epsilonP.doubleValue();
}
IntParameter muP = new IntParameter(MU_ID, 1);
- muP.addConstraint(new GreaterConstraint(0));
- if (config.grab(muP)) {
+ muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(muP)) {
mu = muP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
index 9ac7c072..3f135564 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
@@ -34,8 +34,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -95,8 +94,8 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0));
- alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = 0.0;
if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
new file mode 100644
index 00000000..9d1ee94d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
@@ -0,0 +1,1000 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.PoissonDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * P3C: A Robust Projected Clustering Algorithm.
+ *
+ * <p>
+ * Reference:<br />
+ * Gabriela Moise, Jörg Sander, Martin Ester<br />
+ * P3C: A Robust Projected Clustering Algorithm.<br />
+ * In: Proc. Sixth International Conference on Data Mining (ICDM '06)
+ * </p>
+ *
+ * This is not a complete implementation of P3C, but it should be sufficient
+ * for most users. Improvements are welcome. The most notable omission is the
+ * refinement of the cluster subspaces described in section 3.5 of the paper.
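+ *
+ * A rough usage sketch, assuming {@code database} and {@code relation}
+ * already hold the input data, and using the default parameter values of
+ * the Parameterizer below:
+ *
+ * <pre>
+ * P3C&lt;DoubleVector&gt; p3c = new P3C&lt;&gt;(0.001, 1.e-4, 20, 1.e-5, 1);
+ * Clustering&lt;SubspaceModel&lt;DoubleVector&gt;&gt; result = p3c.run(database, relation);
+ * </pre>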
+ *
+ * @author Florian Nuecke
+ * @author Erich Schubert
+ *
+ * @apiviz.uses EM
+ * @apiviz.has SubspaceModel
+ * @apiviz.has ClusterCandidate
+ * @apiviz.has Signature
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("P3C: A Robust Projected Clustering Algorithm.")
+@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
+public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(P3C.class);
+
+ /**
+ * Parameter for the Poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+   * Minimum cluster size for noise flagging. (Not present in the original
+ * publication).
+ */
+ protected int minClusterSize;
+
+ /**
+ * Alpha threshold for testing.
+ */
+ protected double alpha = 0.001;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha ChiSquared test threshold
+ * @param poissonThreshold Poisson test threshold
+ * @param maxEmIterations Maximum number of EM iterations
+ * @param emDelta EM stopping threshold
+ * @param minClusterSize Minimum cluster size
+ */
+ public P3C(double alpha, double poissonThreshold, int maxEmIterations, double emDelta, int minClusterSize) {
+ super();
+ this.alpha = alpha;
+ this.poissonThreshold = poissonThreshold;
+ this.maxEmIterations = maxEmIterations;
+ this.emDelta = emDelta;
+ this.minClusterSize = minClusterSize;
+ }
+
+  /**
+   * Performs the P3C algorithm on the given Database.
+   *
+   * @param database Database to process
+   * @param relation Relation of number vectors to cluster
+   * @return Clustering result
+   */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+
+ // Overall progress.
+    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
+ }
+
+    // Desired number of bins, as per Sturges' rule:
+ final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
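+    // Example: 1000 objects yield ceil(1 + log2(1000)) = 11 bins per dimension.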
+
+ // Perform 1-dimensional projections, and split into bins.
+ SetDBIDs[][] partitions = partitionData(relation, binCount);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
+ }
+
+ // Set markers for each attribute until they're all deemed uniform.
+ final long[][] markers = new long[dim][];
+ int numuniform = 0;
+ for(int d = 0; d < dim; d++) {
+ final SetDBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d] = BitsUtil.zero(binCount);
+ int card = 0;
+      // Mark at most binCount - 1 bins; at least one bin must remain
+      // unmarked for the uniformity test.
+      while(card < binCount - 1) {
+        // Find the bin with the largest support, considering only the bins
+        // that were not previously marked.
+ int bestBin = chiSquaredUniformTest(parts, marked, card);
+ if(bestBin < 0) {
+ numuniform++;
+ break; // Uniform
+ }
+ BitsUtil.setI(marked, bestBin);
+ card++;
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
+ }
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
+ }
+
+ ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
+ }
+
+ ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
+ }
+
+ clusterCores = pruneRedundantClusterCores(clusterCores);
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of cluster cores found: " + clusterCores.size());
+ }
+
+    if(clusterCores.size() == 0) {
+      // No cluster cores found: return everything as noise.
+      if(stepProgress != null) {
+        stepProgress.setCompleted(LOG);
+      }
+      Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
+      c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+      return c;
+    }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Refining cluster cores to clusters via EM.", LOG);
+ }
+
+ // Track objects not assigned to any cluster:
+ ModifiableDBIDs noise = DBIDUtil.newHashSet();
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+
+ // Initial estimate of covariances, to assign noise objects
+ Vector[] means = new Vector[k];
+ Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
+ double[] normDistrFactor = new double[k];
+ Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
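+    // normDistrFactor starts at 1 / sqrt((2 pi)^d), i.e. assuming unit
+    // determinant; EM.computeInverseMatrixes below refines this to
+    // 1 / sqrt((2 pi)^d * det(Sigma)) using the estimated covariances.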
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
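+    // The returned expectation value measures how well the mixture fits the
+    // data; iterate until its improvement per round drops below emDelta.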
+ for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
+ final double emOld = emNew;
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ // reassign probabilities
+ emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("iteration " + it + " - expectation value: " + emNew);
+ }
+ if((emNew - emOld) <= emDelta) {
+ break;
+ }
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(6, "Generating hard clustering.", LOG);
+ }
+
+ // Create a hard clustering, making sure each data point only is part of one
+ // cluster, based on the best match from the membership matrix.
+ ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(7, "Looking for outliers and moving them to the noise set.", LOG);
+ }
+
+ // Outlier detection. Remove points from clusters that have a Mahalanobis
+ // distance larger than the critical value of the ChiSquare distribution.
+ findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(8, "Removing empty clusters.", LOG);
+ }
+
+ // Remove near-empty clusters.
+ for(Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext();) {
+ ClusterCandidate cand = it.next();
+ final int size = cand.ids.size();
+ if(size < minClusterSize) {
+ if(size > 0) {
+ noise.addDBIDs(cand.ids);
+ }
+ it.remove();
+ }
+ }
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
+ }
+
+ // TODO Check all attributes previously deemed uniform (section 3.5).
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(9, "Generating final result.", LOG);
+ }
+
+ // Generate final output.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
+ ClusterCandidate candidate = clusterCandidates.get(cluster);
+ CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ }
+ LOG.verbose("Noise size: " + noise.size());
+ if(noise.size() > 0) {
+ result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ }
+
+ if(stepProgress != null) {
+ stepProgress.ensureCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Construct the 1-signatures by merging adjacent dense bins.
+ *
+ * @param partitions Initial partitions.
+ * @param markers Markers for dense partitions.
+ * @return 1-signatures
+ */
+ private ArrayList<Signature> constructOneSignatures(SetDBIDs[][] partitions, final long[][] markers) {
+ final int dim = partitions.length;
+ // Generate projected p-signature intervals.
+ ArrayList<Signature> signatures = new ArrayList<>();
+ for(int d = 0; d < dim; d++) {
+ final DBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d];
+ // Find sequences of 1s in marked.
+ for(int start = BitsUtil.nextSetBit(marked, 0); start >= 0;) {
+        int end = BitsUtil.nextClearBit(marked, start + 1);
+        end = (end == -1) ? parts.length : end; // parts.length == number of bins
+ int[] signature = new int[dim << 1];
+ Arrays.fill(signature, -1);
+ signature[d << 1] = start;
+ signature[(d << 1) + 1] = end - 1; // inclusive
+ HashSetModifiableDBIDs sids = unionDBIDs(parts, start, end /* exclusive */);
+ if(LOG.isDebugging()) {
+ LOG.debug("1-signature: " + d + " " + start + "-" + (end - 1));
+ }
+ signatures.add(new Signature(signature, sids));
+        start = (end < parts.length) ? BitsUtil.nextSetBit(marked, end + 1) : -1;
+ }
+ }
+ return signatures;
+ }
+
+ /**
+ * Merge 1-signatures into p-signatures.
+ *
+ * @param binCount Number of bins in each dimension.
+ * @param signatures 1-signatures
+ * @return p-signatures
+ */
+ private ArrayList<Signature> mergeClusterCores(final int binCount, ArrayList<Signature> signatures) {
+ MutableProgress mergeProgress = LOG.isVerbose() ? new MutableProgress("Merging signatures.", signatures.size(), LOG) : null;
+
+ // Annotate dimensions to 1-signatures for quick stopping.
+ int[] firstdim = new int[signatures.size()];
+ for(int i = 0; i < signatures.size(); i++) {
+ firstdim[i] = signatures.get(i).getFirstDim();
+ }
+ LOG.debug("First dimensions: " + FormatUtil.format(firstdim));
+
+ // Merge to (p+1)-signatures (cluster cores).
+ ArrayList<Signature> clusterCores = new ArrayList<>(signatures);
+    // Try merging each 1-signature with each cluster core.
+ for(int i = 0; i < clusterCores.size(); i++) {
+ final Signature parent = clusterCores.get(i);
+ final int end = parent.getFirstDim();
+ for(int j = 0; j < signatures.size() && firstdim[j] < end; j++) {
+ final Signature onesig = signatures.get(j);
+ final Signature merge = mergeSignatures(parent, onesig, binCount);
+ if(merge != null) {
+ // We add each potential core to the list to allow remaining
+ // 1-signatures to try merging with this p-signature as well.
+ clusterCores.add(merge);
+ // Flag both "parents" for removal.
+ parent.prune = true;
+ onesig.prune = true;
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setTotal(clusterCores.size());
+ mergeProgress.incrementProcessed(LOG);
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setProcessed(mergeProgress.getTotal(), LOG);
+ }
+ return clusterCores;
+ }
+
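+  /**
+   * Prune redundant cluster cores (Definition 3, Condition 2 of the paper):
+   * drop cores that are subsets of another retained core.
+   *
+   * @param clusterCores Cluster cores found so far
+   * @return Retained cluster cores
+   */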
+ private ArrayList<Signature> pruneRedundantClusterCores(ArrayList<Signature> clusterCores) {
+ // Prune cluster cores based on Definition 3, Condition 2.
+ ArrayList<Signature> retain = new ArrayList<>(clusterCores.size());
+ outer: for(Signature clusterCore : clusterCores) {
+ if(clusterCore.prune) {
+ continue;
+ }
+ for(int k = 0; k < clusterCores.size(); k++) {
+ Signature other = clusterCores.get(k);
+ if(other != clusterCore) {
+ if(other.isSuperset(clusterCore)) {
+ continue outer;
+ }
+ }
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Retained cluster core: " + clusterCore);
+ }
+ retain.add(clusterCore);
+ }
+    return retain;
+ }
+
+ /**
+ * Partition the data set into {@code bins} bins in each dimension
+ * <i>independently</i>.
+ *
+ * This can be used to construct a grid approximation of the data using O(d n)
+ * memory.
+ *
+ * When a dimension is found to be constant, it will not be partitioned, but
+ * instead the corresponding array will be set to {@code null}.
+ *
+ * @param relation Data relation to partition
+ * @param bins Number of bins
+ * @return Partitions of each dimension.
+ */
+ private SetDBIDs[][] partitionData(final Relation<V> relation, final int bins) {
+ final int dim = RelationUtil.dimensionality(relation);
+ SetDBIDs[][] partitions = new SetDBIDs[dim][bins];
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ DBIDArrayIter iter = ids.iter(); // will be reused.
+ SortDBIDsBySingleDimension sorter = new VectorUtil.SortDBIDsBySingleDimension(relation, 0);
+ for(int d = 0; d < dim; d++) {
+ sorter.setDimension(d);
+ ids.sort(sorter);
+ // Minimum:
+ iter.seek(0);
+ double min = relation.get(iter).doubleValue(d);
+ // Extend:
+ iter.seek(ids.size() - 1);
+ double delta = (relation.get(iter).doubleValue(d) - min) / bins;
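+      // Equal-width binning: delta is the bin width in dimension d; a zero
+      // width means the attribute is constant on this relation.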
+ if(delta > 0.) {
+ SetDBIDs[] dimparts = partitions[d];
+ double split = min + delta;
+ HashSetModifiableDBIDs pids = DBIDUtil.newHashSet();
+ dimparts[0] = pids;
+ int i = 0;
+        for(iter.seek(0); iter.valid(); iter.advance()) {
+          final double v = relation.get(iter).doubleValue(d);
+          // Advance to the bin containing v; the last bin also catches the
+          // maximum value and any floating-point rounding overshoot.
+          while(v > split && i < dimparts.length - 1) {
+            i++;
+            split += delta;
+            pids = DBIDUtil.newHashSet();
+            dimparts[i] = pids;
+          }
+          pids.add(iter);
+        }
+ for(++i; i < dimparts.length; ++i) {
+ dimparts[i] = pids;
+ }
+ }
+ else {
+ partitions[d] = null; // Flag whole dimension as bad
+ }
+ }
+ return partitions;
+ }
+
+ /**
+ * Compute the union of multiple DBID sets.
+ *
+ * @param parts Parts array
+ * @param start Array start index
+ * @param end Array end index (exclusive)
+   * @return Union of the DBID sets in the given range.
+ */
+ protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
+ int sum = 0;
+ for(int i = start; i < end; i++) {
+ sum += parts[i].size();
+ }
+ HashSetModifiableDBIDs sids = DBIDUtil.newHashSet(sum);
+ for(int i = start; i < end; i++) {
+ sids.addDBIDs(parts[i]);
+ }
+ return sids;
+ }
+
+ /**
+ * Performs a ChiSquared test to determine whether an attribute has a uniform
+ * distribution.
+ *
+ * @param parts Data partitions.
+ * @param marked the marked bins that should be ignored.
+   * @param card Number of bins already marked; these are ignored.
+ * @return Position of maximum, or -1 when uniform.
+ */
+ private int chiSquaredUniformTest(SetDBIDs[] parts, long[] marked, int card) {
+ // Remaining number of bins.
+ final int binCount = parts.length - card;
+ // Get global mean over all unmarked bins.
+ int max = 0, maxpos = -1;
+ MeanVariance mv = new MeanVariance();
+ for(int i = 0; i < parts.length; i++) {
+ // Ignore already marked bins.
+ if(BitsUtil.get(marked, i)) {
+ continue;
+ }
+ final int binSupport = parts[i].size();
+ mv.put(binSupport);
+ if(binSupport > max) {
+ max = binSupport;
+ maxpos = i;
+ }
+ }
+ if(mv.getCount() < 1. || !(mv.getNaiveVariance() > 0.)) {
+ return -1;
+ }
+    // ChiSquare statistic: naive variance of the bin sizes over their mean.
+    final double chiSquare = mv.getNaiveVariance() / mv.getMean();
+    // binCount (the remaining bins) already excludes the card marked bins.
+    final double test = ChiSquaredDistribution.cdf(chiSquare, Math.max(1, binCount - 1));
+ if((1. - alpha) < test) {
+ return maxpos;
+ }
+ return -1;
+ }
+
+ /**
+ * Computes a fuzzy membership with the weights based on which cluster cores
+ * each data point is part of.
+ *
+ * @param relation Data relation
+ * @param clusterCores the cluster cores.
+ * @param unassigned set to which to add unassigned points.
+ * @param probClusterIGivenX Membership probabilities.
+ * @param clusterWeights Cluster weights
+ */
+ private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+ final int n = relation.size();
+ final int k = clusterCores.size();
+
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ int count = 0;
+ double[] weights = new double[k];
+ for(int cluster = 0; cluster < k; ++cluster) {
+ if(clusterCores.get(cluster).ids.contains(iter)) {
+ weights[cluster] = 1.;
+ ++count;
+ }
+ }
+
+ // Set value(s) in membership matrix.
+ if(count > 0) {
+ // Rescale.
+ VMath.timesEquals(weights, 1. / count);
+ VMath.plusTimesEquals(clusterWeights, weights, 1. / n);
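+        // Each object contributes 1 / n in total to the cluster weights,
+        // split evenly across the cluster cores containing it.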
+ }
+ else {
+ // Does not match any cluster, mark it.
+ unassigned.add(iter);
+ }
+ probClusterIGivenX.put(iter, weights);
+ }
+ }
+
+ /**
+ * Assign unassigned objects to best candidate based on shortest Mahalanobis
+ * distance.
+ *
+ * @param relation Data relation
+ * @param probClusterIGivenX fuzzy membership matrix.
+ * @param means Cluster means.
+ * @param invCovMatr Cluster covariance matrices.
+   * @param clusterWeights Cluster weights, updated for the new assignments.
+   * @param unassigned the set of points not yet assigned; emptied by this
+   *        method.
+ */
+ private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+ if(unassigned.size() == 0) {
+ return;
+ }
+ final int k = means.length;
+ double pweight = 1. / relation.size();
+
+ for(DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
+ // Find the best matching known cluster core using the Mahalanobis
+ // distance.
+ Vector v = relation.get(iter).getColumnVector();
+ int bestCluster = -1;
+ double minDistance = Double.POSITIVE_INFINITY;
+ for(int c = 0; c < k; ++c) {
+ final double distance = MathUtil.mahalanobisDistance(invCovMatr[c], v.minus(means[c]));
+ if(distance < minDistance) {
+ minDistance = distance;
+ bestCluster = c;
+ }
+ }
+ // Assign to best core.
+ double[] weights = new double[k];
+ weights[bestCluster] = 1.0;
+ clusterWeights[bestCluster] += pweight;
+ probClusterIGivenX.put(iter, weights);
+ }
+
+ // Clear the list of unassigned objects.
+ unassigned.clear();
+ }
+
+ /**
+ * Creates a hard clustering from the specified soft membership matrix.
+ *
+   * @param probClusterIGivenX the membership matrix.
+   * @param clusterCores the cluster cores, one per cluster.
+   * @param dbids the objects to assign to clusters.
+ * @return a hard clustering based on the matrix.
+ */
+ private ArrayList<ClusterCandidate> hardClustering(WritableDataStore<double[]> probClusterIGivenX, List<Signature> clusterCores, DBIDs dbids) {
+ final int k = clusterCores.size();
+
+ // Initialize cluster sets.
+ ArrayList<ClusterCandidate> candidates = new ArrayList<>();
+ for(Signature sig : clusterCores) {
+ candidates.add(new ClusterCandidate(sig));
+ }
+
+ // Perform hard partitioning, assigning each data point only to one cluster,
+ // namely that one it is most likely to belong to.
+ for(DBIDIter iter = dbids.iter(); iter.valid(); iter.advance()) {
+ final double[] probs = probClusterIGivenX.get(iter);
+ int bestCluster = 0;
+ double bestProbability = probs[0];
+ for(int c = 1; c < k; ++c) {
+ if(probs[c] > bestProbability) {
+ bestCluster = c;
+ bestProbability = probs[c];
+ }
+ }
+ candidates.get(bestCluster).ids.add(iter);
+ }
+
+ return candidates;
+ }
+
+ /**
+ * Performs outlier detection by testing the Mahalanobis distance of each
+ * point in a cluster against the critical value of the ChiSquared
+ * distribution with as many degrees of freedom as the cluster has relevant
+ * attributes.
+ *
+ * @param relation Data relation
+ * @param means Cluster means
+ * @param invCovMatr Inverse covariance matrixes
+ * @param clusterCandidates the list of clusters to check.
+ * @param nonUniformDimensionCount the number of dimensions to consider when
+ * testing.
+ * @param noise the set to which to add points deemed outliers.
+ */
+ private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
+ final int k = clusterCandidates.size();
+
+ for(int c = 0; c < k; ++c) {
+ final ClusterCandidate candidate = clusterCandidates.get(c);
+ if(candidate.ids.size() < 2) {
+ continue;
+ }
+ final int dof = candidate.dimensions.cardinality();
+ final double threshold = ChiSquaredDistribution.quantile(1 - .001, dof);
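+      // Critical value at the 0.1% level with one degree of freedom per
+      // relevant attribute. (The level is hardcoded, independent of p3c.alpha.)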
+ for(DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
+ final Vector mean = means[c];
+ final Vector delta = relation.get(iter).getColumnVector().minusEquals(mean);
+ final Matrix invCov = invCovMatr[c];
+ final double distance = MathUtil.mahalanobisDistance(invCov, delta);
+ if(distance >= threshold) {
+ // Outlier, remove it and add it to the outlier set.
+ noise.add(iter);
+ iter.remove();
+ }
+ }
+ }
+ }
+
+ /**
+ * Generates a merged signature of this and another one, where the other
+ * signature must be a 1-signature.
+ *
+ * @param first First signature.
+ * @param second Second signature, must be a 1-signature.
+ * @param numBins Number of bins per dimension.
+ * @return the merged signature, or null if the merge failed.
+ */
+ protected Signature mergeSignatures(Signature first, Signature second, int numBins) {
+ int d2 = -1;
+ for(int i = 0; i < second.spec.length; i += 2) {
+ if(second.spec[i] >= 0) {
+ assert (d2 == -1) : "Merging with non-1-signature?!?";
+ d2 = i;
+ }
+ }
+ assert (d2 >= 0) : "Merging with empty signature?";
+
+ // Avoid generating redundant signatures.
+ if(first.spec[d2] >= 0) {
+ return null;
+ }
+
+ // Definition 3, Condition 1:
+ // True support:
+ final ModifiableDBIDs intersection = DBIDUtil.intersection(first.ids, second.ids);
+ final int support = intersection.size();
+    // Relative interval width: bins in the interval divided by total bins.
+ double width = (second.spec[d2 + 1] - second.spec[d2] + 1.) / (double) numBins;
+ // Expected size thus:
+ double expect = first.ids.size() * width;
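+    // rawProbability is P[Poisson(expect) = support]; the merge is accepted
+    // only if this chance probability is below the Poisson threshold.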
+ if(support <= expect || support < minClusterSize) {
+ return null;
+ }
+ final double test = PoissonDistribution.rawProbability(support, expect);
+    if(poissonThreshold <= test) {
+ return null;
+ }
+ // Create merged signature.
+ int[] spec = first.spec.clone();
+    spec[d2] = second.spec[d2];
+    spec[d2 + 1] = second.spec[d2 + 1];
+
+ final Signature newsig = new Signature(spec, intersection);
+ if(LOG.isDebugging()) {
+ LOG.debug(newsig.toString());
+ }
+ return newsig;
+ }
+
+ /**
+ * P3C Cluster signature.
+ *
+ * @author Erich Schubert
+ */
+ private static class Signature {
+ /**
+     * Subspace specification: for each dimension d, spec[2 * d] and
+     * spec[2 * d + 1] hold the first and last bin (inclusive), or -1 if the
+     * dimension is not part of the signature.
+ */
+ int[] spec;
+
+ /**
+ * Object ids.
+ */
+ DBIDs ids;
+
+ /**
+ * Pruning flag.
+ */
+ boolean prune = false;
+
+ /**
+ * Constructor.
+ *
+ * @param spec Subspace specification
+ * @param ids IDs.
+ */
+ private Signature(int[] spec, DBIDs ids) {
+ super();
+ this.spec = spec;
+ this.ids = ids;
+ }
+
+ /**
+ * Test whether this is a superset of the other signature.
+ *
+ * @param other Other signature.
+ * @return {@code true} when this is a superset.
+ */
+ public boolean isSuperset(Signature other) {
+ for(int i = 0; i < spec.length; i += 2) {
+        if(spec[i] != other.spec[i] || spec[i + 1] != other.spec[i + 1]) {
+ if(other.spec[i] != -1) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Find the first dimension set in this signature.
+ *
+ * @return Dimension
+ */
+ public int getFirstDim() {
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ return (i >>> 1);
+ }
+ }
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ int p = 0;
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ p++;
+ }
+ }
+ StringBuilder buf = new StringBuilder();
+ buf.append(p).append("-signature: ");
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ buf.append(i >>> 1).append(':');
+ buf.append(spec[i]).append('-').append(spec[i + 1]).append(' ');
+ }
+ }
+ buf.append(" size: ").append(ids.size());
+ return buf.toString();
+ }
+ }
+
+ /**
+ * This class is used to represent potential clusters.
+ *
+ * @author Erich Schubert
+ */
+ private static class ClusterCandidate {
+ /**
+ * Selected dimensions
+ */
+ public final BitSet dimensions;
+
+ /**
+ * Objects contained in cluster.
+ */
+ public final ModifiableDBIDs ids;
+
+ /**
+ * Constructor.
+ *
+ * @param clusterCore Signature
+ */
+ public ClusterCandidate(Signature clusterCore) {
+ this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+      for(int i = 0; i < clusterCore.spec.length; i += 2) {
+        if(clusterCore.spec[i] >= 0) {
+          this.dimensions.set(i >> 1);
+        }
+      }
+ this.ids = DBIDUtil.newArray(clusterCore.ids.size());
+ }
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter for the chi squared test threshold.
+ */
+ public static final OptionID ALPHA_THRESHOLD_ID = new OptionID("p3c.alpha", "The significance level for uniform testing in the initial binning step.");
+
+ /**
+ * Parameter for the poisson test threshold.
+ */
+ public static final OptionID POISSON_THRESHOLD_ID = new OptionID("p3c.threshold", "The threshold value for the poisson test used when merging signatures.");
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ public static final OptionID MAX_EM_ITERATIONS_ID = new OptionID("p3c.em.maxiter", "The maximum number of iterations for the EM step. Use -1 to run until delta convergence.");
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ public static final OptionID EM_DELTA_ID = new OptionID("p3c.em.delta", "The change delta for the EM step below which to stop.");
+
+ /**
+     * Minimum cluster size for noise flagging. (Not present in the original
+ * publication).
+ */
+ public static final OptionID MIN_CLUSTER_SIZE_ID = new OptionID("p3c.minsize", "The minimum size of a cluster, otherwise it is seen as noise (this is a cheat, it is not mentioned in the paper).");
+
+ /**
+ * Parameter for the chi squared test threshold.
+ *
+ * While statistical values such as 0.01 are a good choice, we found the
+ * need to modify this parameter in our experiments.
+ */
+ protected double alpha;
+
+ /**
+ * Parameter for the poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+     * Minimum cluster size for noise flagging. (Not present in the original
+ * publication).
+ */
+ protected int minClusterSize;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_THRESHOLD_ID, .001);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(POISSON_THRESHOLD_ID, 1.e-4);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+ if(config.grab(param)) {
+ poissonThreshold = param.getValue();
+ }
+ }
+
+ {
+ IntParameter param = new IntParameter(MAX_EM_ITERATIONS_ID, 20);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_MINUSONE_INT);
+ if(config.grab(param)) {
+ maxEmIterations = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(EM_DELTA_ID, 1.e-5);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ emDelta = param.getValue();
+ }
+ }
+
+ {
+ IntParameter param = new IntParameter(MIN_CLUSTER_SIZE_ID, 1);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ minClusterSize = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected P3C<V> makeInstance() {
+ return new P3C<>(alpha, poissonThreshold, maxEmIterations, emDelta, minClusterSize);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
index 92158734..03e9978f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
@@ -67,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -148,7 +148,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database);
RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
- final Random random = rnd.getRandom();
+ final Random random = rnd.getSingleThreadedRandom();
if (RelationUtil.dimensionality(relation) < l) {
throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")");
@@ -844,7 +844,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
configL(config);
IntParameter m_iP = new IntParameter(M_I_ID, 10);
- m_iP.addConstraint(new GreaterConstraint(0));
+ m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(m_iP)) {
m_i = m_iP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
index c8d0833e..e6245f6e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
@@ -54,7 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -77,7 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Elke Achtert
*
* @apiviz.uses DBSCAN
- * @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction
+ * @apiviz.uses DimensionSelectingSubspaceDistanceFunction
* @apiviz.has SubspaceModel
*
* @param <V> the type of FeatureVector handled by this Algorithm
@@ -488,7 +488,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(minptsP)) {
minpts = minptsP.getValue();
}