diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
48 files changed, 5330 insertions, 730 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index 0c4eb5fc..96c95a9f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistance import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -152,8 +152,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext */ protected void configK(Parameterization config) { IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } } @@ -165,8 +165,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext */ protected void configKI(Parameterization config) { IntParameter k_iP = new IntParameter(K_I_ID, 30); - k_iP.addConstraint(new GreaterConstraint(0)); - if (config.grab(k_iP)) { + k_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(k_iP)) { k_i = k_iP.getValue(); } } @@ -178,8 +178,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext */ protected void configL(Parameterization config) { IntParameter lP = new IntParameter(L_ID); - lP.addConstraint(new 
GreaterConstraint(0)); - if (config.grab(lP)) { + lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(lP)) { l = lP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index ee3b234c..52e37197 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -294,7 +294,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); ModifiableDBIDs seeds = DBIDUtil.newHashSet(); - for (DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) { + for(DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) { int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension(); // nextID is not reachable from start object if(nextID_corrDim > lambda) { @@ -322,9 +322,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext DistanceDBIDList<DoubleDistance> reachables = 
rangeQuery.getRangeForDBID(iter, epsilon); iter.remove(); - + if(reachables.size() > minpts) { - for (DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) { + for(DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) { int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension(); // r is not reachable from q if(corrDim_r > lambda) { @@ -351,9 +351,10 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } } - /* if(processedIDs.size() == relation.size() && noise.size() == 0) { - break; - } */ + /* + * if(processedIDs.size() == relation.size() && noise.size() == 0) { + * break; } + */ } if(currentCluster.size() >= minpts) { @@ -375,7 +376,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(distanceFunction.getInputTypeRestriction()); } - + /** * Parameterization class. 
* @@ -411,7 +412,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext protected void configMinPts(Parameterization config) { IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } @@ -435,7 +436,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext protected void configLambda(Parameterization config) { IntParameter lambdaP = new IntParameter(LAMBDA_ID); - lambdaP.addConstraint(new GreaterConstraint(0)); + lambdaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(lambdaP)) { lambda = lambdaP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index 57dcb435..09c78fec 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -38,9 +38,8 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -52,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -82,24 +81,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor private static final Logging LOG = Logging.getLogger(DBSCAN.class); /** - * Parameter to specify the maximum radius of the neighborhood to be - * considered, must be suitable to the distance function specified. + * Holds the epsilon radius threshold. */ - public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered."); + protected D epsilon; /** - * Holds the value of {@link #EPSILON_ID}. - */ - private D epsilon; - - /** - * Parameter to specify the threshold for minimum number of points in the - * epsilon-neighborhood of a point, must be an integer greater than 0. - */ - public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); - - /** - * Holds the value of {@link #MINPTS_ID}. + * Holds the minimum cluster size. 
*/ protected int minpts; @@ -146,7 +133,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor if(size < minpts) { // The can't be any clusters noise.addDBIDs(relation.getDBIDs()); - objprog.setProcessed(noise.size(), LOG); + if(objprog != null) { + objprog.setProcessed(noise.size(), LOG); + } } else { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { @@ -193,7 +182,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * @param objprog the progress object for logging the current status */ protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { - DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); + DBIDs neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); // startObject is no core-object if(neighbors.size() < minpts) { @@ -207,7 +196,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } // try to expand the cluster - HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet(); + ModifiableDBIDs seeds = DBIDUtil.newHashSet(); ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DBIDIter seed = neighbors.iter(); seed.valid(); seed.advance()) { if(!processedIDs.contains(seed)) { @@ -222,9 +211,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } seeds.remove(startObjectID); - while(seeds.size() > 0) { + while(!seeds.isEmpty()) { DBIDMIter o = seeds.iter(); - DistanceDBIDList<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon); + DBIDs neighborhood = rangeQuery.getRangeForDBID(o, epsilon); o.remove(); if(neighborhood.size() >= minpts) { @@ -282,6 +271,18 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * @apiviz.exclude */ public static class Parameterizer<O, D extends Distance<D>> extends 
AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Parameter to specify the maximum radius of the neighborhood to be + * considered, must be suitable to the distance function specified. + */ + public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered."); + + /** + * Parameter to specify the threshold for minimum number of points in the + * epsilon-neighborhood of a point, must be an integer greater than 0. + */ + public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); + protected D epsilon = null; protected int minpts = 0; @@ -295,7 +296,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } @@ -306,4 +307,4 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor return new DBSCAN<>(distanceFunction, epsilon, minpts); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index 3c2e0278..814b4cc4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -496,7 +496,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index c66442a1..e82ec674 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.EMModel; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import 
de.lmu.ifi.dbs.elki.database.Database; @@ -41,14 +42,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; @@ -57,8 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -72,8 +73,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * zero-covariance and variance=1 in covariance matrices. * </p> * <p> - * Reference: A. 
P. Dempster, N. M. Laird, D. B. Rubin: Maximum Likelihood from - * Incomplete Data via the EM algorithm. <br> + * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br /> + * Maximum Likelihood from Incomplete Data via the EM algorithm.<br> * In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31 * </p> * @@ -100,48 +101,36 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< private static final double SINGULARITY_CHEAT = 1E-9; /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find."); - - /** - * Holds the value of {@link #K_ID}. + * Number of clusters */ private int k; /** - * Parameter to specify the termination criterion for maximization of E(M): - * E(M) - E(M') < em.delta, must be a double equal to or greater than 0. + * Delta parameter */ - public static final OptionID DELTA_ID = new OptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta"); + private double delta; /** - * Parameter to specify the initialization method + * Class to choose the initial means */ - public static final OptionID INIT_ID = new OptionID("kmeans.initialization", "Method to choose the initial means."); - - private static final double MIN_LOGLIKELIHOOD = -100000; + private KMeansInitialization<V> initializer; /** - * Holds the value of {@link #DELTA_ID}. + * Maximum number of iterations to allow */ - private double delta; + private int maxiter; /** - * Store the individual probabilities, for use by EMOutlierDetection etc. + * Retain soft assignments. 
*/ - private WritableDataStore<double[]> probClusterIGivenX; + private boolean soft; - /** - * Class to choose the initial means - */ - private KMeansInitialization<V> initializer; + private static final double MIN_LOGLIKELIHOOD = -100000; /** - * Maximum number of iterations to allow + * Soft assignment result type. */ - private int maxiter; + public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class); /** * Constructor. @@ -150,13 +139,15 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @param delta delta parameter * @param initializer Class to choose the initial means * @param maxiter Maximum number of iterations + * @param soft Include soft assignments */ - public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter) { + public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter, boolean soft) { super(); this.k = k; this.delta = delta; this.initializer = initializer; this.maxiter = maxiter; + this.setSoft(soft); } /** @@ -172,137 +163,80 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @return Result */ public Clustering<EMModel<V>> run(Database database, Relation<V> relation) { - if (relation.size() == 0) { + if(relation.size() == 0) { throw new IllegalArgumentException("database empty: must contain elements"); } // initial models - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("initializing " + k + " models"); } - List<Vector> means = new ArrayList<>(); - for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC)) { - means.add(nv.getColumnVector()); + final List<V> initialMeans = initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC); + assert (initialMeans.size() == k); + Vector[] means = new Vector[k]; + { + int i = 0; + for(NumberVector<?> nv : initialMeans) { + means[i] = nv.getColumnVector(); 
+ i++; + } } - List<Matrix> covarianceMatrices = new ArrayList<>(k); + Matrix[] covarianceMatrices = new Matrix[k]; double[] normDistrFactor = new double[k]; - List<Matrix> invCovMatr = new ArrayList<>(k); + Matrix[] invCovMatr = new Matrix[k]; double[] clusterWeights = new double[k]; - probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); + WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); - final int dimensionality = means.get(0).getDimensionality(); - for (int i = 0; i < k; i++) { + final int dimensionality = means[0].getDimensionality(); + final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality); + for(int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); - covarianceMatrices.add(m); - final double det = m.det(); - if (det > 0.) { - normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det); - } else { - LOG.warning("Encountered matrix with 0 determinant - degenerated."); - normDistrFactor[i] = 1.0; // Not really well defined - } - invCovMatr.add(m.inverse()); + covarianceMatrices[i] = m; + normDistrFactor[i] = 1.0 / Math.sqrt(norm); + invCovMatr[i] = Matrix.identity(dimensionality, dimensionality); clusterWeights[i] = 1.0 / k; - if (LOG.isDebuggingFinest()) { - StringBuilder msg = new StringBuilder(); - msg.append(" model ").append(i).append(":\n"); - msg.append(" mean: ").append(means.get(i)).append('\n'); - msg.append(" m:\n").append(FormatUtil.format(m, " ")).append('\n'); - msg.append(" m.det(): ").append(det).append('\n'); - msg.append(" cluster weight: ").append(clusterWeights[i]).append('\n'); - msg.append(" normDistFact: ").append(normDistrFactor[i]).append('\n'); - LOG.debugFine(msg.toString()); - } } double emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, 
invCovMatr, clusterWeights, probClusterIGivenX); // iteration unless no change - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("iterating EM"); } - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("iteration " + 0 + " - expectation value: " + emNew); } - double em; - for (int it = 1; it <= maxiter || maxiter < 0; it++) { - em = emNew; - - // recompute models - List<Vector> meanSums = new ArrayList<>(k); - double[] sumOfClusterProbabilities = new double[k]; - - for (int i = 0; i < k; i++) { - clusterWeights[i] = 0.0; - meanSums.add(new Vector(dimensionality)); - covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality)); - } - - // weights and means - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - double[] clusterProbabilities = probClusterIGivenX.get(iditer); - - for (int i = 0; i < k; i++) { - sumOfClusterProbabilities[i] += clusterProbabilities[i]; - Vector summand = relation.get(iditer).getColumnVector().timesEquals(clusterProbabilities[i]); - meanSums.get(i).plusEquals(summand); - } - } - final int n = relation.size(); - for (int i = 0; i < k; i++) { - clusterWeights[i] = sumOfClusterProbabilities[i] / n; - Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]); - means.set(i, newMean); - } - // covariance matrices - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - double[] clusterProbabilities = probClusterIGivenX.get(iditer); - Vector instance = relation.get(iditer).getColumnVector(); - for (int i = 0; i < k; i++) { - Vector difference = instance.minus(means.get(i)); - covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i])); - } - } - for (int i = 0; i < k; i++) { - covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT)); - } - for (int i = 0; i < k; i++) { - final double det = 
covarianceMatrices.get(i).det(); - if (det > 0.) { - normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det); - } else { - LOG.warning("Encountered matrix with 0 determinant - degenerated."); - normDistrFactor[i] = 1.0; // Not really well defined - } - invCovMatr.set(i, covarianceMatrices.get(i).inverse()); - } + for(int it = 1; it <= maxiter || maxiter < 0; it++) { + final double emOld = emNew; + recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dimensionality); + computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm); // reassign probabilities emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("iteration " + it + " - expectation value: " + emNew); } - if (Math.abs(em - emNew) <= delta) { + if(Math.abs(emOld - emNew) <= delta) { break; } } - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("assigning clusters"); } // fill result with clusters and models List<ModifiableDBIDs> hardClusters = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { hardClusters.add(DBIDUtil.newHashSet()); } // provide a hard clustering - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double[] clusterProbabilities = probClusterIGivenX.get(iditer); int maxIndex = 0; double currentMax = 0.0; - for (int i = 0; i < k; i++) { - if (clusterProbabilities[i] > currentMax) { + for(int i = 0; i < k; i++) { + if(clusterProbabilities[i] > currentMax) { maxIndex = i; currentMax = clusterProbabilities[i]; } @@ -312,24 +246,89 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Clustering<EMModel<V>> result = new 
Clustering<>("EM Clustering", "em-clustering"); // provide models within the result - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { // TODO: re-do labeling. // SimpleClassLabel label = new SimpleClassLabel(); // label.init(result.canonicalClusterLabel(i)); - Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i))); + Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means[i].getArrayRef()), covarianceMatrices[i])); result.addToplevelCluster(model); } + if(isSoft()) { + result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs())); + } + else { + probClusterIGivenX.destroy(); + } return result; } /** + * Compute the inverse cluster matrices. + * + * @param covarianceMatrices Input covariance matrices + * @param invCovMatr Output array for inverse matrices + * @param normDistrFactor Output array for norm distribution factors. + * @param norm Normalization factor, usually (2pi)^d + */ + public static void computeInverseMatrixes(Matrix[] covarianceMatrices, Matrix[] invCovMatr, double[] normDistrFactor, final double norm) { + int k = covarianceMatrices.length; + for(int i = 0; i < k; i++) { + final double det = covarianceMatrices[i].det(); + if(det > 0.) { + normDistrFactor[i] = 1. / Math.sqrt(norm * det); + } + else { + LOG.warning("Encountered matrix with 0 determinant - degenerated."); + normDistrFactor[i] = 1.; // Not really well defined + } + invCovMatr[i] = covarianceMatrices[i].inverse(); + } + } + + /** + * Recompute the covariance matrixes. 
+ * + * @param relation Vector data + * @param probClusterIGivenX Object probabilities + * @param means Cluster means output + * @param covarianceMatrices Output covariance matrixes + * @param dimensionality Data set dimensionality + */ + public static void recomputeCovarianceMatrices(Relation<? extends NumberVector<?>> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] covarianceMatrices, final int dimensionality) { + final int k = means.length; + CovarianceMatrix[] cms = new CovarianceMatrix[k]; + for(int i = 0; i < k; i++) { + cms[i] = new CovarianceMatrix(dimensionality); + } + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); + Vector instance = relation.get(iditer).getColumnVector(); + for(int i = 0; i < k; i++) { + if(clusterProbabilities[i] > 0.) { + cms[i].put(instance, clusterProbabilities[i]); + } + } + } + for(int i = 0; i < k; i++) { + if(cms[i].getWeight() <= 0.) { + means[i] = new Vector(dimensionality); + covarianceMatrices[i] = Matrix.identity(dimensionality, dimensionality); + } + else { + means[i] = cms[i].getMeanVector(); + covarianceMatrices[i] = cms[i].destroyToNaiveMatrix().cheatToAvoidSingularity(SINGULARITY_CHEAT); + } + } + } + + /** * Assigns the current probability values to the instances in the database and * compute the expectation value of the current mixture of distributions. * * Computed as the sum of the logarithms of the prior probability of each * instance. 
* - * @param database the database used for assignment to instances + * @param relation the database used for assignment to instances * @param normDistrFactor normalization factor for density function, based on * current covariance matrix * @param means the current means @@ -337,58 +336,55 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @param clusterWeights the weights of the current clusters * @return the expectation value of the current mixture of distributions */ - protected double assignProbabilitiesToInstances(Relation<V> database, double[] normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { - double emSum = 0.0; + public static double assignProbabilitiesToInstances(Relation<? extends NumberVector<?>> relation, double[] normDistrFactor, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { + final int k = clusterWeights.length; + double emSum = 0.; - for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { - Vector x = database.get(iditer).getColumnVector(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + Vector x = relation.get(iditer).getColumnVector(); double[] probabilities = new double[k]; - for (int i = 0; i < k; i++) { - Vector difference = x.minus(means.get(i)); - double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference); - double power = rowTimesCovTimesCol / 2.0; + for(int i = 0; i < k; i++) { + Vector difference = x.minus(means[i]); + double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr[i], difference); + double power = rowTimesCovTimesCol / 2.; double prob = normDistrFactor[i] * Math.exp(-power); - if (LOG.isDebuggingFinest()) { - LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + 
"\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); + if(LOG.isDebuggingFinest()) { + LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + // + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + // + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + // + " power= " + power + "\n" + " prob=" + prob + "\n" + // + " inv cov matrix: \n" + FormatUtil.format(invCovMatr[i], " ")); } - if (!(prob >= 0.)) { + if(!(prob >= 0.)) { LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor[i]); + prob = 0.; } probabilities[i] = prob; } - double priorProbability = 0.0; - for (int i = 0; i < k; i++) { + double priorProbability = 0.; + for(int i = 0; i < k; i++) { priorProbability += probabilities[i] * clusterWeights[i]; } double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD); - if (!Double.isNaN(logP)) { + if(!Double.isNaN(logP)) { emSum += logP; } double[] clusterProbabilities = new double[k]; - for (int i = 0; i < k; i++) { - assert (clusterWeights[i] >= 0.0); + for(int i = 0; i < k; i++) { + assert (clusterWeights[i] >= 0.); // do not divide by zero! - if (priorProbability > 0.0) { + if(priorProbability > 0.) { clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i]; - } else { - clusterProbabilities[i] = 0.0; + } + else { + clusterProbabilities[i] = 0.; } } probClusterIGivenX.put(iditer, clusterProbabilities); } - return emSum; - } - - /** - * Get the probabilities for a given point. 
- * - * @param index Point ID - * @return Probabilities of given point - */ - public double[] getProbClusterIGivenX(DBIDRef index) { - return probClusterIGivenX.get(index); + return emSum / relation.size(); } @Override @@ -402,6 +398,20 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< } /** + * @return the soft + */ + public boolean isSoft() { + return soft; + } + + /** + * @param soft the soft to set + */ + public void setSoft(boolean soft) { + this.soft = soft; + } + + /** * Parameterization class. * * @author Erich Schubert @@ -409,45 +419,77 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find."); + + /** + * Parameter to specify the termination criterion for maximization of E(M): + * E(M) - E(M') < em.delta, must be a double equal to or greater than 0. + */ + public static final OptionID DELTA_ID = new OptionID("em.delta", // + "The termination criterion for maximization of E(M): " + // + "E(M) - E(M') < em.delta"); + + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = new OptionID("kmeans.initialization", // + "Method to choose the initial means."); + + /** + * Number of clusters. + */ protected int k; + /** + * Stopping threshold + */ protected double delta; + /** + * Initialization method + */ protected KMeansInitialization<V> initializer; + /** + * Maximum number of iterations. 
+ */ protected int maxiter = -1; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.0); - deltaP.addConstraint(new GreaterEqualConstraint(0.0)); - if (config.grab(deltaP)) { + deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(deltaP)) { delta = deltaP.getValue(); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); maxiterP.setOptional(true); - if (config.grab(maxiterP)) { + if(config.grab(maxiterP)) { maxiter = maxiterP.getValue(); } } @Override protected EM<V> makeInstance() { - return new EM<>(k, delta, initializer, maxiter); + return new EM<>(k, delta, initializer, maxiter, false); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java index e928d041..a4a922df 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java @@ -33,10 +33,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import 
de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; -import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -146,7 +146,8 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor // boxing/unboxing. for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if(!processedIDs.contains(iditer)) { - // We need to do some ugly casts to be able to run the optimized version, unfortunately. + // We need to do some ugly casts to be able to run the optimized + // version, unfortunately. 
@SuppressWarnings("unchecked") final ClusterOrderResult<DoubleDistance> doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder); @SuppressWarnings("unchecked") @@ -304,7 +305,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.intValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java index 583d402b..db343f3a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java @@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; @@ -240,6 +239,10 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< // By default, clusters cover both the steep up and steep down area int cstart = sda.getStartIndex(); int cend = sua.getEndIndex(); + // Hotfix: never include infinity-reachable points at the end + while(cend > cstart && Double.isInfinite(clusterOrder.get(cend).getReachability().doubleValue())) { + --cend; + } // However, we 
sometimes have to adjust this (Condition 4): { // Case b) @@ -654,8 +657,8 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter xiP = new DoubleParameter(XI_ID); - xiP.addConstraint(new GreaterEqualConstraint(0.0)); - xiP.addConstraint(new LessConstraint(1.0)); + xiP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + xiP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); if(config.grab(xiP)) { xi = xiP.doubleValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index 95d9f23c..86bb9a09 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -328,7 +328,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.intValue(); } diff --git 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java new file mode 100644 index 00000000..68dacf34 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java @@ -0,0 +1,350 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import gnu.trove.iterator.TIntObjectIterator; +import gnu.trove.map.hash.TIntObjectHashMap; +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Cluster analysis by affinity propagation. + * + * Reference: + * <p> + * Clustering by Passing Messages Between Data Points<br /> + * B. J. Frey and D. 
Dueck<br /> + * Science Vol 315 + * </p> + * + * @author Erich Schubert + * + * @apiviz.composedOf AffinityPropagationInitialization + * + * @param <O> object type + */ +@Title("Affinity Propagation: Clustering by Passing Messages Between Data Points") +@Reference(title = "Clustering by Passing Messages Between Data Points", authors = "B. J. Frey and D. Dueck", booktitle = "Science Vol 315", url = "http://dx.doi.org/10.1126/science.1136800") +public class AffinityPropagationClusteringAlgorithm<O> extends AbstractAlgorithm<Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { + /** + * Class logger + */ + private static final Logging LOG = Logging.getLogger(AffinityPropagationClusteringAlgorithm.class); + + /** + * Similarity initialization + */ + AffinityPropagationInitialization<O> initialization; + + /** + * Damping factor lambda. + */ + double lambda = 0.5; + + /** + * Terminate after 10 iterations with no changes. + */ + int convergence = 10; + + /** + * Maximum number of iterations. + */ + int maxiter = 1000; + + /** + * Constructor. + * + * @param initialization Similarity initialization + * @param lambda Damping factor + * @param convergence Termination threshold (Number of stable iterations) + * @param maxiter Maximum number of iterations + */ + public AffinityPropagationClusteringAlgorithm(AffinityPropagationInitialization<O> initialization, double lambda, int convergence, int maxiter) { + super(); + this.initialization = initialization; + this.lambda = lambda; + this.convergence = convergence; + this.maxiter = maxiter; + } + + /** + * Perform affinity propagation clustering. 
+ * + * @param db Database + * @param relation Relation + * @return Clustering result + */ + public Clustering<MedoidModel> run(Database db, Relation<O> relation) { + ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); + final int size = ids.size(); + + int[] assignment = new int[size]; + double[][] s = initialization.getSimilarityMatrix(db, relation, ids); + double[][] r = new double[size][size]; + double[][] a = new double[size][size]; + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null; + MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null; + + int inactive = 0; + for(int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) { + // Update responsibility matrix: + for(int i = 0; i < size; i++) { + double[] ai = a[i], ri = r[i], si = s[i]; + // Find the two largest values (as initially maxk == i) + double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY; + int maxk = -1; + for(int k = 0; k < size; k++) { + double val = ai[k] + si[k]; + if(val > max1) { + max2 = max1; + max1 = val; + maxk = k; + } + else if(val > max2) { + max2 = val; + } + } + // With the maximum value known, update r: + for(int k = 0; k < size; k++) { + double val = si[k] - ((k != maxk) ? max1 : max2); + ri[k] = ri[k] * lambda + val * (1. - lambda); + } + } + // Update availability matrix + for(int k = 0; k < size; k++) { + // Compute sum of max(0, r_ik) for all i. + // For r_kk, don't apply the max. + double colposum = 0.; + for(int i = 0; i < size; i++) { + if(i == k || r[i][k] > 0.) { + colposum += r[i][k]; + } + } + for(int i = 0; i < size; i++) { + double val = colposum; + // Adjust column sum by the one extra term. + if(i == k || r[i][k] > 0.) { + val -= r[i][k]; + } + if(i != k && val > 0.) 
{ // min + val = 0.; + } + a[i][k] = a[i][k] * lambda + val * (1 - lambda); + } + } + int changed = 0; + for(int i = 0; i < size; i++) { + double[] ai = a[i], ri = r[i]; + double max = Double.NEGATIVE_INFINITY; + int maxj = -1; + for(int j = 0; j < size; j++) { + double v = ai[j] + ri[j]; + if(v > max || (i == j && v >= max)) { + max = v; + maxj = j; + } + } + if(assignment[i] != maxj) { + changed += 1; + assignment[i] = maxj; + } + } + inactive = (changed > 0) ? 0 : (inactive + 1); + if(prog != null) { + prog.incrementProcessed(LOG); + } + if(aprog != null) { + aprog.setProcessed(size - changed, LOG); + } + } + if(aprog != null) { + aprog.setProcessed(aprog.getTotal(), LOG); + } + if(prog != null) { + prog.setCompleted(LOG); + } + // Cluster map, by lead object + TIntObjectHashMap<ModifiableDBIDs> map = new TIntObjectHashMap<>(); + DBIDArrayIter i1 = ids.iter(); + for(int i = 0; i1.valid(); i1.advance(), i++) { + int c = assignment[i]; + // Add to cluster members: + ModifiableDBIDs cids = map.get(c); + if(cids == null) { + cids = DBIDUtil.newArray(); + map.put(c, cids); + } + cids.add(i1); + } + // If we stopped early, the cluster lead might be in a different cluster. + for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) { + iter.advance(); // Trove iterator; advance first! + final int key = iter.key(); + int targetkey = key; + ModifiableDBIDs tids = null; + // Chase arrows: + while(ids == null && assignment[targetkey] != targetkey) { + targetkey = assignment[targetkey]; + tids = map.get(targetkey); + } + if(tids != null && targetkey != key) { + tids.addDBIDs(iter.value()); + iter.remove(); + } + } + + Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering"); + ModifiableDBIDs noise = DBIDUtil.newArray(); + for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) { + iter.advance(); // Trove iterator; advance first! 
+ i1.seek(iter.key()); + if(iter.value().size() > 1) { + MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1)); + clustering.addToplevelCluster(new Cluster<>(iter.value(), mod)); + } + else { + noise.add(i1); + } + } + if(noise.size() > 0) { + MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter())); + clustering.addToplevelCluster(new Cluster<>(noise, true, mod)); + } + return clustering; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(initialization.getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> object type + */ + public static class Parameterizer<O> extends AbstractParameterizer { + /** + * Parameter for the similarity matrix initialization + */ + public static final OptionID INITIALIZATION_ID = new OptionID("ap.initialization", "Similarity matrix initialization.."); + + /** + * Parameter for the dampening factor. + */ + public static final OptionID LAMBDA_ID = new OptionID("ap.lambda", "Dampening factor lambda. Usually 0.5 to 1."); + + /** + * Parameter for the convergence factor. + */ + public static final OptionID CONVERGENCE_ID = new OptionID("ap.convergence", "Number of stable iterations for convergence."); + + /** + * Parameter for the convergence factor. + */ + public static final OptionID MAXITER_ID = new OptionID("ap.maxiter", "Maximum number of iterations."); + + /** + * Initialization function for the similarity matrix. + */ + AffinityPropagationInitialization<O> initialization; + + /** + * Dampening parameter. + */ + double lambda = .5; + + /** + * Number of stable iterations for convergence. + */ + int convergence; + + /** + * Maximum number of iterations. 
+ */ + int maxiter; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final ObjectParameter<AffinityPropagationInitialization<O>> param = new ObjectParameter<>(INITIALIZATION_ID, AffinityPropagationInitialization.class, DistanceBasedInitializationWithMedian.class); + if(config.grab(param)) { + initialization = param.instantiateClass(config); + } + final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, .5); + lambdaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + lambdaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); + if(config.grab(lambdaP)) { + lambda = lambdaP.doubleValue(); + } + final IntParameter convergenceP = new IntParameter(CONVERGENCE_ID, 15); + convergenceP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(convergenceP)) { + convergence = convergenceP.intValue(); + } + final IntParameter maxiterP = new IntParameter(MAXITER_ID, 1000); + if(config.grab(maxiterP)) { + maxiter = maxiterP.intValue(); + } + } + + @Override + protected AffinityPropagationClusteringAlgorithm<O> makeInstance() { + return new AffinityPropagationClusteringAlgorithm<>(initialization, lambda, convergence, maxiter); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java new file mode 100644 index 00000000..5dbc54de --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java @@ -0,0 +1,59 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you 
can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable; + +/** + * Initialization methods for affinity propagation. + * + * @author Erich Schubert + */ +public interface AffinityPropagationInitialization<O> extends Parameterizable { + /** + * Quantile to use for the diagonal entries. + */ + public static final OptionID QUANTILE_ID = new OptionID("ap.quantile", "Quantile to use for diagonal entries."); + + /** + * Compute the initial similarity matrix. + * + * @param db Database + * @param relation Data relation + * @param ids indexed DBIDs + * @return Similarity matrix + */ + double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids); + + /** + * Get the data type information for the similarity computations. 
+ * + * @return Data type + */ + TypeInformation getInputTypeRestriction(); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java new file mode 100644 index 00000000..2c8cabf9 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java @@ -0,0 +1,148 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Distance based initialization. + * + * @author Erich Schubert + * + * @param <O> Object type + * @param <D> Distance type + */ +public class DistanceBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> { + /** + * Distance function. + */ + DistanceFunction<? super O, D> distance; + + /** + * Quantile to use. + */ + double quantile; + + /** + * Constructor. + * + * @param distance Similarity function + * @param quantile Quantile + */ + public DistanceBasedInitializationWithMedian(DistanceFunction<? 
super O, D> distance, double quantile) { + super(); + this.distance = distance; + this.quantile = quantile; + } + + @Override + public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) { + final int size = ids.size(); + DistanceQuery<O, D> dq = db.getDistanceQuery(relation, distance); + double[][] mat = new double[size][size]; + double[] flat = new double[(size * (size - 1)) >> 1]; + // TODO: optimize for double valued primitive distances. + DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); + for (int i = 0, j = 0; i < size; i++, i1.advance()) { + double[] mati = mat[i]; + i2.seek(i + 1); + for (int k = i + 1; k < size; k++, i2.advance()) { + mati[k] = -dq.distance(i1, i2).doubleValue(); + mat[k][i] = mati[k]; // symmetry. + flat[j] = mati[k]; + j++; + } + } + double median = QuickSelect.quantile(flat, quantile); + // On the diagonal, we place the median + for (int i = 0; i < size; i++) { + mat[i][i] = median; + } + return mat; + } + + @Override + public TypeInformation getInputTypeRestriction() { + return distance.getInputTypeRestriction(); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> Object type + * @param <D> Distance type + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + /** + * Parameter for the distance function. + */ + public static final OptionID DISTANCE_ID = new OptionID("ap.distance", "Distance function to use."); + + /** + * istance function. + */ + DistanceFunction<? super O, D> distance; + + /** + * Quantile to use. + */ + double quantile; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<DistanceFunction<? 
super O, D>> param = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class, SquaredEuclideanDistanceFunction.class); + if (config.grab(param)) { + distance = param.instantiateClass(config); + } + + DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5); + if (config.grab(quantileP)) { + quantile = quantileP.doubleValue(); + } + } + + @Override + protected DistanceBasedInitializationWithMedian<O, D> makeInstance() { + return new DistanceBasedInitializationWithMedian<>(distance, quantile); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java new file mode 100644 index 00000000..a138da96 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java @@ -0,0 +1,153 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.LinearKernelFunction; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Similarity based initialization. + * + * @author Erich Schubert + * + * @param <O> Object type + * @param <D> Distance type + */ +public class SimilarityBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> { + /** + * Similarity function. + */ + SimilarityFunction<? super O, D> similarity; + + /** + * Quantile to use. + */ + double quantile; + + /** + * Constructor. + * + * @param similarity Similarity function + * @param quantile Quantile + */ + public SimilarityBasedInitializationWithMedian(SimilarityFunction<? 
super O, D> similarity, double quantile) { + super(); + this.similarity = similarity; + this.quantile = quantile; + } + + @Override + public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) { + final int size = ids.size(); + SimilarityQuery<O, D> sq = db.getSimilarityQuery(relation, similarity); + double[][] mat = new double[size][size]; + double[] flat = new double[(size * (size - 1)) >> 1]; + // TODO: optimize for double valued primitive distances. + DBIDArrayIter i1 = ids.iter(), i2 = ids.iter(); + // Compute self-similarities first, for centering: + for (int i = 0; i < size; i++, i1.advance()) { + mat[i][i] = sq.similarity(i1, i1).doubleValue() * .5; + } + i1.seek(0); + for (int i = 0, j = 0; i < size; i++, i1.advance()) { + final double[] mati = mat[i]; // Probably faster access. + i2.seek(i + 1); + for (int k = i + 1; k < size; k++, i2.advance()) { + mati[k] = sq.similarity(i1, i2).doubleValue() - mati[i] - mat[k][k]; + mat[k][i] = mati[k]; // symmetry. + flat[j] = mati[k]; + j++; + } + } + double median = QuickSelect.quantile(flat, quantile); + // On the diagonal, we place the median + for (int i = 0; i < size; i++) { + mat[i][i] = median; + } + return mat; + } + + @Override + public TypeInformation getInputTypeRestriction() { + return similarity.getInputTypeRestriction(); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> Object type + * @param <D> Distance type + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + /** + * Parameter for the similarity function. + */ + public static final OptionID SIMILARITY_ID = new OptionID("ap.similarity", "Similarity function to use."); + + /** + * Similarity function. + */ + SimilarityFunction<? super O, D> similarity; + + /** + * Quantile to use. 
+ */ + double quantile; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<SimilarityFunction<? super O, D>> param = new ObjectParameter<>(SIMILARITY_ID, SimilarityFunction.class, LinearKernelFunction.class); + if (config.grab(param)) { + similarity = param.instantiateClass(config); + } + + DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5); + if (config.grab(quantileP)) { + quantile = quantileP.doubleValue(); + } + } + + @Override + protected SimilarityBasedInitializationWithMedian<O, D> makeInstance() { + return new SimilarityBasedInitializationWithMedian<>(similarity, quantile); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java new file mode 100644 index 00000000..bc6059ac --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java @@ -0,0 +1,27 @@ +/** + * Affinity Propagation (AP) clustering. + */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java new file mode 100644 index 00000000..8b875340 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java @@ -0,0 +1,302 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.BitSet; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.BiclusterModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; + +/** + * Abstract class as a convenience for different biclustering approaches. + * <p/> + * The typically required values describing submatrices are computed using the + * corresponding values within a database of NumberVectors. + * <p/> + * The database is supposed to present a data matrix with a row representing an + * entry ({@link NumberVector}), a column representing a dimension (attribute) + * of the {@link NumberVector}s. + * + * @author Arthur Zimek + * @param <V> a certain subtype of NumberVector - the data matrix is supposed to + * consist of rows where each row relates to an object of type V and the + * columns relate to the attribute values of these objects + * @param <M> Cluster model type + */ +public abstract class AbstractBiclustering<V extends NumberVector<?>, M extends BiclusterModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> { + /** + * Keeps the currently set database. + */ + private Database database; + + /** + * Relation we use. 
+ */ + protected Relation<V> relation; + + /** + * Iterator to use for more efficient random access. + */ + private DBIDArrayIter iter; + + /** + * The row ids corresponding to the currently set {@link #relation}. + */ + protected ArrayDBIDs rowIDs; + + /** + * Column dimensionality. + */ + private int colDim; + + /** + * Constructor. + */ + protected AbstractBiclustering() { + super(); + } + + /** + * Prepares the algorithm for running on a specific database. + * <p/> + * Assigns the database, the row ids, and the col ids, then calls + * {@link #biclustering()}. + * <p/> + * Any concrete algorithm should be implemented within method + * {@link #biclustering()} by an inheriting biclustering approach. + * + * @param relation Relation to process + * @return Clustering result + */ + public final Clustering<M> run(Relation<V> relation) { + this.relation = relation; + if (this.relation == null || this.relation.size() == 0) { + throw new IllegalArgumentException(ExceptionMessages.DATABASE_EMPTY); + } + colDim = RelationUtil.dimensionality(relation); + rowIDs = DBIDUtil.ensureArray(this.relation.getDBIDs()); + iter = rowIDs.iter(); + return biclustering(); + } + + /** + * Run the actual biclustering algorithm. + * <p/> + * This method is supposed to be called only from the method + * {@link #run}. + * <p/> + */ + protected abstract Clustering<M> biclustering(); + + /** + * Convert a bitset into integer column ids. + * + * @param cols + * @return integer column ids + */ + protected int[] colsBitsetToIDs(BitSet cols) { + int[] colIDs = new int[cols.cardinality()]; + int colsIndex = 0; + for (int i = cols.nextSetBit(0); i >= 0; i = cols.nextSetBit(i + 1)) { + colIDs[colsIndex] = i; + colsIndex++; + } + return colIDs; + } + + /** + * Convert a bitset into integer row ids. 
+ * + * @param rows + * @return integer row ids + */ + protected ArrayDBIDs rowsBitsetToIDs(BitSet rows) { + ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray(rows.cardinality()); + DBIDArrayIter iter = this.rowIDs.iter(); + for (int i = rows.nextSetBit(0); i >= 0; i = rows.nextSetBit(i + 1)) { + iter.seek(i); + rowIDs.add(iter); + } + return rowIDs; + } + + /** + * Defines a Bicluster as given by the included rows and columns. + * + * @param rows the rows included in the Bicluster + * @param cols the columns included in the Bicluster + * @return a Bicluster as given by the included rows and columns + */ + protected Cluster<BiclusterModel> defineBicluster(BitSet rows, BitSet cols) { + ArrayDBIDs rowIDs = rowsBitsetToIDs(rows); + int[] colIDs = colsBitsetToIDs(cols); + return new Cluster<>(rowIDs, new BiclusterModel(colIDs)); + } + + /** + * Defines a Bicluster as given by the included rows and columns. + * + * @param rows the rows included in the Bicluster + * @param cols the columns included in the Bicluster + * @return A Bicluster as given by the included rows and columns + */ + protected Cluster<BiclusterModel> defineBicluster(long[] rows, long[] cols) { + ArrayDBIDs rowIDs = rowsBitsetToIDs(rows); + int[] colIDs = colsBitsetToIDs(cols); + return new Cluster<>(rowIDs, new BiclusterModel(colIDs)); + } + + /** + * Returns the value of the data matrix at row <code>row</code> and column + * <code>col</code>. 
+ * + * @param row the row in the data matrix according to the current order of + * rows (refers to database entry + * <code>database.get(rowIDs[row])</code>) + * @param col the column in the data matrix according to the current order of + * rows (refers to the attribute value of an database entry + * <code>getValue(colIDs[col])</code>) + * @return the attribute value of the database entry as retrieved by + * <code>database.get(rowIDs[row]).getValue(colIDs[col])</code> + */ + protected double valueAt(int row, int col) { + iter.seek(row); + return relation.get(iter).doubleValue(col); + } + + /** + * Get the DBID of a certain row + * + * @param row Row number + * @return DBID of this row + * @deprecated Expensive! + */ + @Deprecated + protected DBID getRowDBID(int row) { + return rowIDs.get(row); + } + + /** + * Convert a bitset into integer column ids. + * + * @param cols + * @return integer column ids + */ + protected int[] colsBitsetToIDs(long[] cols) { + int[] colIDs = new int[(int) BitsUtil.cardinality(cols)]; + int colsIndex = 0; + for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + if (clong == 0L) { + cpos += Long.SIZE; + continue; + } + for (int j = 0; j < Long.SIZE; ++j, ++cpos, clong >>>= 1) { + if ((clong & 1L) == 1L) { + colIDs[colsIndex] = cpos; + ++colsIndex; + } + } + } + return colIDs; + } + + /** + * Convert a bitset into integer row ids. + * + * @param rows + * @return integer row ids + */ + protected ArrayDBIDs rowsBitsetToIDs(long[] rows) { + ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray((int) BitsUtil.cardinality(rows)); + DBIDArrayIter iter = this.rowIDs.iter(); + outer: for (int rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. 
+ if (rlong == 0L) { + iter.advance(Long.SIZE); + continue; + } + for (int i = 0; i < Long.SIZE; ++i, rlong >>>= 1, iter.advance()) { + if (!iter.valid()) { + break outer; + } + if ((rlong & 1L) == 1L) { + rowIDs.add(iter); + } + } + } + return rowIDs; + } + + /** + * Provides the number of rows of the data matrix. + * + * @return the number of rows of the data matrix + */ + protected int getRowDim() { + return this.rowIDs.size(); + } + + /** + * Provides the number of columns of the data matrix. + * + * @return the number of columns of the data matrix + */ + protected int getColDim() { + return colDim; + } + + /** + * Getter for database. + * + * @return database + */ + public Database getDatabase() { + return database; + } + + /** + * Getter for the relation. + * + * @return relation + */ + public Relation<V> getRelation() { + return relation; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java new file mode 100644 index 00000000..e110faff --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java @@ -0,0 +1,900 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Arrays; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Perform Cheng and Church biclustering. + * + * <p> + * Reference: <br> + * Y. Cheng and G. M. Church. Biclustering of expression data. 
In Proceedings of + * the 8th International Conference on Intelligent Systems for Molecular Biology + * (ISMB), San Diego, CA, 2000. + * </p> + * + * @author Erich Schubert + * @param <V> Vector type. + */ +@Reference(authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)") +public class ChengAndChurch<V extends NumberVector<?>> extends AbstractBiclustering<V, BiclusterWithInversionsModel> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(ChengAndChurch.class); + + /** + * The minimum number of columns that the database must have so that a removal + * of columns is performed in {@link #multipleNodeDeletion}.</p> + * <p> + * Just start deleting multiple columns when more than 100 columns are in the + * data matrix. + * </p> + */ + private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100; + + /** + * The minimum number of rows that the database must have so that a removal of + * rows is performed in {@link #multipleNodeDeletion}. + * <p> + * Just start deleting multiple rows when more than 100 rows are in the data + * matrix. + * </p> + * <!-- + * <p> + * The value is set to 100 as this is not really described in the paper. + * </p> + * --> + */ + private static final int MIN_ROW_REMOVE_THRESHOLD = 100; + + /** + * Threshold for the score. + */ + private double delta; + + /** + * The parameter for multiple node deletion.</p> + * <p> + * It is used to magnify the {@link #delta} value in the + * {@link #multipleNodeDeletion} method. + * </p> + */ + private double alpha; + + /** + * Number of biclusters to be found. + */ + private int n; + + /** + * Allow inversion of rows in the last phase. + */ + private boolean useinverted = true; + + /** + * Distribution to sample random replacement values from. + */ + private Distribution dist; + + /** + * Constructor. 
+ * + * @param delta Delta parameter: desired quality + * @param alpha Alpha parameter: controls switching to single node deletion + * approach + * @param n Number of clusters to detect + * @param dist Distribution of random values to insert + */ + public ChengAndChurch(double delta, double alpha, int n, Distribution dist) { + super(); + this.delta = delta; + this.alpha = alpha; + this.n = n; + this.dist = dist; + } + + /** + * Visitor pattern for processing cells. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static interface CellVisitor { + /** Different modes of operation. */ + int ALL = 0, SELECTED = 1, NOT_SELECTED = 2; + + /** + * Visit a cell. + * + * @param val Value + * @param row Row Number + * @param col Column number + * @param selrow Boolean, whether row is selected + * @param selcol Boolean, whether column is selected + * @return Stop flag, return {@code true} to stop visiting + */ + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol); + } + + /** + * Bicluster candidate. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected static class BiclusterCandidate { + /** + * Cardinalities. + */ + int rowcard, colcard; + + /** + * Means. + */ + double[] rowM, colM; + + /** + * Row and column bitmasks. + */ + long[] rows, irow, cols; + + /** + * Mean of the current bicluster. + */ + double allM; + + /** + * The current bicluster score (mean squared residue). + */ + double residue; + + /** + * Constructor. + * + * @param rows Row dimensionality. + * @param cols Column dimensionality. + */ + protected BiclusterCandidate(int rows, int cols) { + super(); + this.rows = BitsUtil.ones(rows); + this.irow = BitsUtil.zero(rows); + this.rowcard = rows; + this.rowM = new double[rows]; + this.cols = BitsUtil.ones(cols); + this.colcard = cols; + this.colM = new double[cols]; + } + + /** + * Resets the values for the next cluster search. 
+ */ + protected void reset() { + rows = BitsUtil.ones(rowM.length); + rowcard = rowM.length; + cols = BitsUtil.ones(colM.length); + colcard = colM.length; + BitsUtil.zeroI(irow); + } + + /** + * Visit all selected cells in the data matrix. + * + * @param mat Data matrix + * @param mode Operation mode + * @param visitor Visitor function + */ + protected void visitAll(double[][] mat, int mode, CellVisitor visitor) { + // For efficiency, we manually iterate over the rows and column bitmasks. + // This saves repeated shifting needed by the manual bit access. + for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. + if((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) { + rpos += Long.SIZE; + continue; + } + for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { + boolean rselected = ((rlong & 1L) == 1L); + if((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) { + continue; + } + for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + if((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) { + cpos += Long.SIZE; + continue; + } + for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { + boolean cselected = ((clong & 1L) == 1L); + if((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) { + continue; + } + boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected); + if(stop) { + return; + } + } + } + } + } + } + + /** + * Visit a column of the matrix. 
+ * + * @param mat Data matrix + * @param col Column to visit + * @param mode Operation mode + * @param visitor Visitor function + */ + protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) { + boolean cselected = BitsUtil.get(cols, col); + // For efficiency, we manually iterate over the rows and column bitmasks. + // This saves repeated shifting needed by the manual bit access. + for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. + if(mode == CellVisitor.SELECTED && rlong == 0L) { + rpos += Long.SIZE; + continue; + } + if(mode == CellVisitor.NOT_SELECTED && rlong == -1L) { + rpos += Long.SIZE; + continue; + } + for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { + boolean rselected = ((rlong & 1L) == 1L); + if(mode == CellVisitor.SELECTED && !rselected) { + continue; + } + if(mode == CellVisitor.NOT_SELECTED && rselected) { + continue; + } + boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected); + if(stop) { + return; + } + } + } + } + + /** + * Visit a row of the data matrix. + * + * @param mat Data matrix + * @param row Row to visit + * @param visitor Visitor function + */ + protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) { + boolean rselected = BitsUtil.get(rows, row); + final double[] rowdata = mat[row]; + for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + // Fast skip blocks of 64 masked values. 
+ if(mode == CellVisitor.SELECTED && clong == 0L) { + cpos += Long.SIZE; + continue; + } + if(mode == CellVisitor.NOT_SELECTED && clong == -1L) { + cpos += Long.SIZE; + continue; + } + for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { + boolean cselected = ((clong & 1L) == 1L); + if(mode == CellVisitor.SELECTED && !cselected) { + continue; + } + if(mode == CellVisitor.NOT_SELECTED && cselected) { + continue; + } + boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected); + if(stop) { + return; + } + } + } + } + + /** Visitor for updating the means. */ + private final CellVisitor MEANVISITOR = new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + if(selcol) { + rowM[row] += val; + } + if(selrow) { + colM[col] += val; + } + if(selcol && selrow) { + allM += val; + } + return false; + } + }; + + /** + * Update the row means and column means. + * + * @param mat Data matrix + * @param all Flag, to update all + * @return overall mean + */ + protected double updateRowAndColumnMeans(final double[][] mat, boolean all) { + final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED; + Arrays.fill(rowM, 0.); + Arrays.fill(colM, 0.); + allM = 0.; + visitAll(mat, mode, MEANVISITOR); + visitColumn(mat, 0, mode, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + rowM[row] /= colcard; + return false; + } + }); + visitRow(mat, 0, mode, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + colM[col] /= rowcard; + return false; + } + }); + allM /= colcard * rowcard; + return allM; + } + + /** + * Compute the mean square residue. 
+ * + * @param mat Data matrix + * @return mean squared residue + */ + protected double computeMeanSquaredDeviation(final double[][] mat) { + final Mean msr = new Mean(); + visitAll(mat, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow && selcol); + double v = val - rowM[row] - colM[col] + allM; + msr.put(v * v); + return false; + } + }); + residue = msr.getMean(); + return residue; + } + + /** + * Computes the <b>mean row residue</b> of the given <code>row</code>. + * + * @param mat Data matrix + * @param row The row who's residue should be computed. + * @param rowinverted Indicates if the row should be considered inverted. + * @return The row residue of the given <code>row</code>. + */ + protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) { + final Mean rowResidue = new Mean(); + visitRow(mat, row, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + final double rowMean = rowM[row]; + final double colMean = colM[col]; + double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM; + rowResidue.put(v * v); + return false; + } + }); + return rowResidue.getMean(); + } + + /** + * + * Computes the <b>mean column residue</b> of the given <code>col</code>. + * + * @param col The column who's residue should be computed. + * @return The row residue of the given <code>col</code>um. 
+ */ + protected double computeColResidue(final double[][] mat, final int col) { + final double bias = colM[col] - allM; + final Mean colResidue = new Mean(); + visitColumn(mat, col, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + final double rowMean = rowM[row]; + double v = val - rowMean - bias; + colResidue.put(v * v); + return false; + } + }); + return colResidue.getMean(); + } + + /** + * Updates the mask with replacement values for all data in the given rows + * and columns. + * + * @param mat Mask to update. + * @param replacement Distribution to sample replacement values from. + */ + protected void maskMatrix(final double[][] mat, final Distribution replacement) { + visitAll(mat, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow && selcol); + mat[row][col] = replacement.nextRandom(); + return false; + } + }); + } + + /** + * Select or deselect a column. + * + * @param cnum Column to select + * @param set Value to set + */ + protected void selectColumn(int cnum, boolean set) { + if(set) { + BitsUtil.setI(cols, cnum); + colcard++; + } + else { + BitsUtil.clearI(cols, cnum); + colcard--; + } + } + + /** + * Select or deselect a row. 
+ * + * @param rnum Row to select + * @param set Value to set + */ + protected void selectRow(int rnum, boolean set) { + if(set) { + BitsUtil.setI(rows, rnum); + rowcard++; + } + else { + BitsUtil.clearI(rows, rnum); + rowcard--; + } + } + + protected void invertRow(int rnum, boolean b) { + BitsUtil.setI(irow, rnum); + } + } + + @Override + public Clustering<BiclusterWithInversionsModel> biclustering() { + double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs); + + BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim()); + + Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering"); + ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs()); + + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null; + for(int i = 0; i < n; i++) { + cand.reset(); + multipleNodeDeletion(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + singleNodeDeletion(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + nodeAddition(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + cand.maskMatrix(mat, dist); + BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow)); + final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows); + noise.removeDBIDs(cids); + result.addToplevelCluster(new Cluster<>(cids, model)); + + if(LOG.isVerbose()) { + LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n"); + LOG.verbose("Number of rows: " + cand.rowcard + "\n"); + LOG.verbose("Number of columns: " + cand.colcard + "\n"); + // LOG.verbose("Total number of masked values: " + maskedVals.size() + + // "\n"); + } + 
if(prog != null) { + prog.incrementProcessed(LOG); + } + } + // Add a noise cluster, full-dimensional. + if(!noise.isEmpty()) { + long[] allcols = BitsUtil.ones(getColDim()); + BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS); + result.addToplevelCluster(new Cluster<>(noise, true, model)); + } + if(prog != null) { + prog.ensureCompleted(LOG); + } + return result; + } + + /** + * Algorithm 1 of Cheng and Church: + * + * Remove single rows or columns. + * + * Inverted rows are not supported in this method. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { + // Assume that cand.residue is up to date! + while(cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) { + // Store current maximum. Need final mutable, so use arrays. + final double[] max = { Double.NEGATIVE_INFINITY }; + final int[] best = { -1, -1 }; + + // Test rows + if(cand.rowcard > 2) { + cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + double rowResidue = cand.computeRowResidue(mat, row, false); + if(max[0] < rowResidue) { + max[0] = rowResidue; + best[0] = row; + } + return false; + } + }); + } + + // Test columns: + if(cand.colcard > 2) { + cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + double colResidue = cand.computeColResidue(mat, col); + if(max[0] < colResidue) { + max[0] = colResidue; + best[1] = col; + } + return false; + } + }); + } + + if(best[1] >= 0) { // then override bestrow! 
+ cand.selectColumn(best[1], false); + } + else { + assert (best[0] >= 0); + cand.selectRow(best[0], false); + } + // TODO: incremental update could be much faster? + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + } + } + + // + /** + * Algorithm 2 of Cheng and Church. + * + * Remove all rows and columns that reduce the residue by alpha. + * + * Inverted rows are not supported in this method. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + + // Note: assumes that cand.residue = H(I,J) + while(cand.residue > delta) { + final boolean[] modified = { false, false }; + + // Step 2: remove rows above threshold + if(cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) { + final double alphaResidue = alpha * cand.residue; + cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + if(cand.computeRowResidue(mat, row, false) > alphaResidue) { + cand.selectRow(row, false); + modified[0] = true; + } + return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD); + } + }); + + // Step 3: update residue + if(modified[0]) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + } + } + + // Step 4: remove columns above threshold + if(cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) { + final double alphaResidue = alpha * cand.residue; + cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + if(cand.computeColResidue(mat, col) > alphaResidue) { + 
cand.selectColumn(col, false); + modified[1] = true; + } + return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD); + } + }); + if(modified[1]) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + } + } + + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + // Step 5: if nothing has been removed, try removing single nodes. + if(!modified[0] && !modified[1]) { + break; + // Will be executed next in main loop, as per algorithm 4. + // singleNodeDeletion(); + } + } + } + + /** + * Algorithm 3 of Cheng and Church. + * + * Try to re-add rows or columns that decrease the overall score. + * + * Also try adding inverted rows. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + while(true) { + // We need this to be final + mutable + final boolean[] added = new boolean[] { false, false }; + + // Step 2: add columns + cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selcol); + if(cand.computeColResidue(mat, col) <= cand.residue) { + cand.selectColumn(col, true); + added[0] = true; + } + return false; + } + }); + + // Step 3: recompute values + if(added[0]) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + } + + // Step 4: try adding rows. + cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selrow); + if(cand.computeRowResidue(mat, row, false) <= cand.residue) { + cand.selectRow(row, true); + added[1] = true; + } + return false; + } + }); + + // Step 5: try adding inverted rows. 
+ if(useinverted) { + cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selrow); + if(cand.computeRowResidue(mat, row, true) <= cand.residue) { + cand.selectRow(row, true); + cand.invertRow(row, true); + added[1] = true; + } + return false; + } + }); + } + if(added[1]) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + } + if(!added[0] && !added[1]) { + break; + } + } + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + /** + * Parameter to specify the distribution of replacement values when masking + * a cluster. + */ + public static final OptionID DIST_ID = new OptionID("chengandchurch.replacement", "Distribution of replacement values when masking found clusters."); + + /** + * Threshold value to determine the maximal acceptable score (mean squared + * residue) of a bicluster. + * <p/> + * Key: {@code -chengandchurch.delta} + * </p> + */ + public static final OptionID DELTA_ID = new OptionID("chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster."); + + /** + * Parameter for multiple node deletion to accelerate the algorithm. 
(>= + * 1) + * <p/> + * Key: {@code -chengandchurch.alpha} + * </p> + */ + public static final OptionID ALPHA_ID = new OptionID("chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm."); + + /** + * Number of biclusters to be found. + * <p/> + * Default value: 1 + * </p> + * <p/> + * Key: {@code -chengandchurch.n} + * </p> + */ + public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found."); + + /** + * Threshold for the score ({@link #DELTA_ID}). + */ + private double delta; + + /** + * The parameter for multiple node deletion.</p> + * <p> + * It is used to magnify the {@link #delta} value in the + * {@link ChengAndChurch#multipleNodeDeletion} method. + * </p> + */ + private double alpha; + + /** + * Number of biclusters to be found. + */ + private int n; + + /** + * Distribution of replacement values. + */ + private Distribution dist; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + DoubleParameter deltaP = new DoubleParameter(DELTA_ID); + if(config.grab(deltaP)) { + delta = deltaP.doubleValue(); + } + deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + + IntParameter nP = new IntParameter(N_ID, 1); + nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(nP)) { + n = nP.intValue(); + } + + DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.); + alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE); + if(config.grab(alphaP)) { + alpha = alphaP.doubleValue(); + } + + ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class); + if(config.grab(distP)) { + dist = distP.instantiateClass(config); + } + } + + @Override + protected ChengAndChurch<V> makeInstance() { + return new ChengAndChurch<>(delta, alpha, n, dist); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java new file mode 100644 index 00000000..21363bfc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java @@ -0,0 +1,28 @@ +/** + * <p>Biclustering algorithms.</p> + * + * + */ +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2013 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index 0d82add9..8e5fa627 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -74,7 +74,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; @@ -838,22 +838,22 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.getValue(); } IntParameter maxlevelP = new IntParameter(MAXLEVEL_ID); - maxlevelP.addConstraint(new GreaterConstraint(0)); + maxlevelP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(maxlevelP)) { maxlevel = maxlevelP.getValue(); } IntParameter mindimP = new IntParameter(MINDIM_ID, 1); - mindimP.addConstraint(new GreaterConstraint(0)); + mindimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(mindimP)) { mindim = mindimP.getValue(); } DoubleParameter jitterP = new DoubleParameter(JITTER_ID); 
- jitterP.addConstraint(new GreaterConstraint(0)); + jitterP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if (config.grab(jitterP)) { jitter = jitterP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 9a4b8512..68878aef 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -29,7 +29,7 @@ import java.util.Map; import java.util.Map.Entry; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; @@ -270,7 +270,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) { ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters); ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query); - reconfig.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist); + reconfig.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist); ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm); reconfig.failOnErrors(); return instance; @@ -335,7 +335,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class); if(config.grab(algP)) { ListParameterization predefined = new ListParameterization(); - predefined.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI); + 
predefined.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI); TrackParameters trackpar = new TrackParameters(config); ChainedParameterization chain = new ChainedParameterization(predefined, trackpar); chain.errorsTo(config); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java index d1b714bf..79ddc16e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java @@ -36,9 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -162,33 +160,34 @@ public class HiCO<V extends NumberVector<?>> extends OPTICS<V, PCACorrelationDis super.makeOptions(config);
IntParameter muP = new IntParameter(MU_ID);
- muP.addConstraint(new GreaterConstraint(0));
- if (config.grab(muP)) {
+ muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(muP)) {
mu = muP.getValue();
}
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
kP.setOptional(true);
final int k;
- if (config.grab(kP)) {
+ if(config.grab(kP)) {
k = kP.getValue();
- } else {
+ }
+ else {
k = mu;
}
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, DEFAULT_DELTA);
- deltaP.addConstraint(new GreaterEqualConstraint(0));
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
double delta = DEFAULT_DELTA;
- if (config.grab(deltaP)) {
+ if(config.grab(deltaP)) {
delta = deltaP.doubleValue();
}
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0)); - alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = DEFAULT_ALPHA;
- if (config.grab(alphaP)) {
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java index f9531be0..99144b42 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -116,7 +116,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { * Number of sampling rounds to find a good split */ private final int samplingLevel; - + /** * Random factory */ @@ -163,34 +163,34 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null; IndefiniteProgress cprogress = LOG.isVerbose() ? 
new IndefiniteProgress("Clusters found", LOG) : null; ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs()); - Random r = rnd.getRandom(); + Random r = rnd.getSingleThreadedRandom(); final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation)); int cnum = 0; - while (unclustered.size() > minsize) { + while(unclustered.size() > minsize) { DBIDs current = unclustered; int lmDim = 1; - for (int k = 1; k <= maxdim; k++) { + for(int k = 1; k <= maxdim; k++) { // Implementation note: this while loop is from the original publication // and the published LMCLUS source code. It doesn't make sense to me - // it is lacking a stop criterion other than "cluster is too small" and // "cluster is inseparable"! Additionally, there is good criterion for // stopping at the appropriate dimensionality either. - while (true) { + while(true) { Separation separation = findSeparation(relation, current, k, r); // logger.verbose("k: " + k + " goodness: " + separation.goodness + // " threshold: " + separation.threshold); - if (separation.goodness <= sensitivityThreshold) { + if(separation.goodness <= sensitivityThreshold) { break; } ModifiableDBIDs subset = DBIDUtil.newArray(current.size()); - for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) { - if (deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { + for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) { + if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { subset.add(iter); } } // logger.verbose("size:"+subset.size()); - if (subset.size() < minsize) { + if(subset.size() < minsize) { break; } current = subset; @@ -199,7 +199,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } } // No more clusters found - if (current.size() < minsize || current == unclustered) { + if(current.size() < minsize || current == 
unclustered) { break; } // New cluster found @@ -210,22 +210,22 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { ret.addToplevelCluster(cluster); // Remove from main working set. unclustered.removeDBIDs(current); - if (progress != null) { + if(progress != null) { progress.setProcessed(relation.size() - unclustered.size(), LOG); } - if (cprogress != null) { + if(cprogress != null) { cprogress.setProcessed(cnum, LOG); } } // Remaining objects are noise - if (unclustered.size() > 0) { + if(unclustered.size() > 0) { ret.addToplevelCluster(new Cluster<>(unclustered, true)); } - if (progress != null) { + if(progress != null) { progress.setProcessed(relation.size(), LOG); progress.ensureCompleted(LOG); } - if (cprogress != null) { + if(cprogress != null) { cprogress.setCompleted(LOG); } return ret; @@ -272,7 +272,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size()); // System.out.println("Number of samples: " + samples); int remaining_retries = 100; - for (int i = 1; i <= samples; i++) { + for(int i = 1; i <= samples; i++) { DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong()); final DBIDIter iter = sample.iter(); // Use first as origin @@ -282,17 +282,17 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { Matrix basis; { List<Vector> vectors = new ArrayList<>(sample.size() - 1); - for (; iter.valid(); iter.advance()) { + for(; iter.valid(); iter.advance()) { Vector vec = relation.get(iter).getColumnVector(); vectors.add(vec.minusEquals(originV)); } // generate orthogonal basis basis = generateOrthonormalBasis(vectors); - if (basis == null) { + if(basis == null) { // new sample has to be taken. 
i--; remaining_retries--; - if (remaining_retries < 0) { + if(remaining_retries < 0) { throw new AbortException("Too many retries in sampling, and always a linear dependant data set."); } continue; @@ -301,9 +301,9 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { // Generate and fill a histogram. DoubleDynamicHistogram histogram = new DoubleDynamicHistogram(BINS); double w = 1.0 / currentids.size(); - for (DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) { // Skip sampled points - if (sample.contains(iter2)) { + if(sample.contains(iter2)) { continue; } Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV); @@ -311,7 +311,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { histogram.increment(distance, w); } double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold - if (th[1] > separation.goodness) { + if(th[1] > separation.goodness) { separation.goodness = th[1]; separation.threshold = th[0]; separation.originV = originV; @@ -341,16 +341,16 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { first = first.times(1.0 / first.euclideanLength()); Matrix ret = new Matrix(first.getDimensionality(), vectors.size()); ret.setCol(0, first); - for (int i = 1; i < vectors.size(); i++) { + for(int i = 1; i < vectors.size(); i++) { // System.out.println("Matrix:" + ret); Vector v_i = vectors.get(i); Vector u_i = v_i.copy(); // System.out.println("Vector " + i + ":" + partialSol); - for (int j = 0; j < i; j++) { + for(int j = 0; j < i; j++) { Vector v_j = ret.getCol(j); double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j); - if (Double.isNaN(f)) { - if (LOG.isDebuggingFine()) { + if(Double.isNaN(f)) { + if(LOG.isDebuggingFine()) { LOG.debugFine("Zero vector encountered? 
" + v_j); } return null; @@ -359,8 +359,8 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } // check if the vectors weren't independent final double len_u_i = u_i.euclideanLength(); - if (len_u_i == 0.0) { - if (LOG.isDebuggingFine()) { + if(len_u_i == 0.0) { + if(LOG.isDebuggingFine()) { LOG.debugFine("Points not independent - no orthonormalization."); } return null; @@ -391,7 +391,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { { MeanVariance mv = new MeanVariance(); DoubleHistogram.Iter forward = histogram.iter(); - for (int i = 0; forward.valid(); i++, forward.advance()) { + for(int i = 0; forward.valid(); i++, forward.advance()) { p1[i] = forward.getValue() + ((i > 0) ? p1[i - 1] : 0); mv.put(i, forward.getValue()); mu1[i] = mv.getMean(); @@ -404,7 +404,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { DoubleHistogram.Iter backwards = histogram.iter(); backwards.seek(histogram.getNumBins() - 1); // Seek to last - for (int j = n - 1; backwards.valid(); j--, backwards.retract()) { + for(int j = n - 1; backwards.valid(); j--, backwards.retract()) { p2[j] = backwards.getValue() + ((j + 1 < n) ? 
p2[j + 1] : 0); mv.put(j, backwards.getValue()); mu2[j] = mv.getMean(); @@ -412,7 +412,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } } - for (int i = 0; i < n; i++) { + for(int i = 0; i < n; i++) { jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i]))); } @@ -420,23 +420,23 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { double bestgoodness = Double.NEGATIVE_INFINITY; double devPrev = jt[1] - jt[0]; - for (int i = 1; i < jt.length - 1; i++) { + for(int i = 1; i < jt.length - 1; i++) { double devCur = jt[i + 1] - jt[i]; // System.out.println(p1[i]); // System.out.println(jt[i + 1]); // System.out.println(jt[i]); // System.out.println(devCur); // Local minimum found - calculate depth - if (devCur >= 0 && devPrev <= 0) { + if(devCur >= 0 && devPrev <= 0) { double lowestMaxima = Double.POSITIVE_INFINITY; - for (int j = i - 1; j > 0; j--) { - if (jt[j - 1] < jt[j]) { + for(int j = i - 1; j > 0; j--) { + if(jt[j - 1] < jt[j]) { lowestMaxima = Math.min(lowestMaxima, jt[j]); break; } } - for (int j = i + 1; j < n - 2; j++) { - if (jt[j + 1] < jt[j]) { + for(int j = i + 1; j < n - 2; j++) { + if(jt[j + 1] < jt[j]) { lowestMaxima = Math.min(lowestMaxima, jt[j]); break; } @@ -445,11 +445,11 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { final double mud = mu1[i] - mu2[i]; double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]); - if (Double.isNaN(discriminability)) { + if(Double.isNaN(discriminability)) { discriminability = -1; } double goodness = localDepth * discriminability; - if (goodness > bestgoodness) { + if(goodness > bestgoodness) { bestgoodness = goodness; bestpos = i; } @@ -552,7 +552,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { * Threshold */ private double threshold; - + /** * Random generator */ @@ -562,26 +562,26 @@ public class LMCLUS extends 
AbstractAlgorithm<Clustering<Model>> { protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter maxLMDimP = new IntParameter(MAXDIM_ID); - maxLMDimP.addConstraint(new GreaterEqualConstraint(1)); + maxLMDimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); maxLMDimP.setOptional(true); - if (config.grab(maxLMDimP)) { + if(config.grab(maxLMDimP)) { maxdim = maxLMDimP.getValue(); } IntParameter minsizeP = new IntParameter(MINSIZE_ID); - minsizeP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(minsizeP)) { + minsizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minsizeP)) { minsize = minsizeP.getValue(); } IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100); - if (config.grab(samplingLevelP)) { + if(config.grab(samplingLevelP)) { samplingLevel = samplingLevelP.getValue(); } DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID); - if (config.grab(sensivityThresholdP)) { + if(config.grab(sensivityThresholdP)) { threshold = sensivityThresholdP.getValue(); } RandomParameter rndP = new RandomParameter(RANDOM_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index a9c67a58..7733ddaa 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -61,8 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -135,7 +134,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // current dimensionality associated with each seed int dim_c = RelationUtil.dimensionality(relation); - if (dim_c < l) { + if(dim_c < l) { throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + dim_c + " < " + l + ")"); } @@ -149,8 +148,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null; - while (k_c > k) { - if (cprogress != null) { + while(k_c > k) { + if(cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } @@ -158,8 +157,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri assign(relation, distFunc, clusters); // determine current subspace associated with each cluster - for (ORCLUSCluster cluster : clusters) { - if (cluster.objectIDs.size() > 0) { + for(ORCLUSCluster cluster : clusters) { + if(cluster.objectIDs.size() > 0) { cluster.basis = findBasis(relation, distFunc, cluster, dim_c); } } @@ -172,18 +171,19 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } assign(relation, distFunc, clusters); - if (cprogress != null) { + if(cprogress != null) { cprogress.setProcessed(clusters.size()); cprogress.setCompleted(LOG); } // get the result Clustering<Model> r = new Clustering<>("ORCLUS clustering", "orclus-clustering"); - for (ORCLUSCluster c : clusters) { + for(ORCLUSCluster c : clusters) { r.addToplevelCluster(new Cluster<Model>(c.objectIDs, ClusterModel.CLUSTER)); } return r; - } catch 
(Exception e) { + } + catch(Exception e) { throw new IllegalStateException(e); } } @@ -199,7 +199,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, rnd); NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database); List<ORCLUSCluster> seeds = new ArrayList<>(); - for (DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { seeds.add(new ORCLUSCluster(database.get(iter), iter, factory)); } return seeds; @@ -217,29 +217,29 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri private void assign(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters) { NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database); // clear the current clusters - for (ORCLUSCluster cluster : clusters) { + for(ORCLUSCluster cluster : clusters) { cluster.objectIDs.clear(); } // projected centroids of the clusters List<V> projectedCentroids = new ArrayList<>(clusters.size()); - for (ORCLUSCluster c : clusters) { + for(ORCLUSCluster c : clusters) { projectedCentroids.add(projection(c, c.centroid, factory)); } // for each data point o do - for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { V o = database.get(it); DoubleDistance minDist = null; ORCLUSCluster minCluster = null; // determine projected distance between o and cluster - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < clusters.size(); i++) { ORCLUSCluster c = clusters.get(i); V o_proj = projection(c, o, factory); DoubleDistance dist = distFunc.distance(o_proj, projectedCentroids.get(i)); - if (minDist == null || minDist.compareTo(dist) > 0) { + if(minDist == null || minDist.compareTo(dist) > 0) { minDist = dist; minCluster = c; 
} @@ -250,8 +250,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } // recompute the seed in each clusters - for (ORCLUSCluster cluster : clusters) { - if (cluster.objectIDs.size() > 0) { + for(ORCLUSCluster cluster : clusters) { + if(cluster.objectIDs.size() > 0) { cluster.centroid = Centroid.make(database, cluster.objectIDs).toVector(database); } } @@ -271,7 +271,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // covariance matrix of cluster // Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs); GenericDistanceDBIDList<DoubleDistance> results = new GenericDistanceDBIDList<>(cluster.objectIDs.size()); - for (DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { + for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it)); results.add(distance, it); } @@ -304,9 +304,9 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri */ private void merge(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters, int k_new, int d_new, IndefiniteProgress cprogress) { ArrayList<ProjectedEnergy> projectedEnergies = new ArrayList<>(); - for (int i = 0; i < clusters.size(); i++) { - for (int j = 0; j < clusters.size(); j++) { - if (i >= j) { + for(int i = 0; i < clusters.size(); i++) { + for(int j = 0; j < clusters.size(); j++) { + if(i >= j) { continue; } // projected energy of c_ij in subspace e_ij @@ -318,8 +318,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } } - while (clusters.size() > k_new) { - if (cprogress != null) { + while(clusters.size() > k_new) { + if(cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } // find the smallest value of r_ij @@ -327,12 +327,12 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // renumber 
the clusters by replacing cluster c_i with cluster c_ij // and discarding cluster c_j - for (int c = 0; c < clusters.size(); c++) { - if (c == minPE.i) { + for(int c = 0; c < clusters.size(); c++) { + if(c == minPE.i) { clusters.remove(c); clusters.add(c, minPE.cluster); } - if (c == minPE.j) { + if(c == minPE.j) { clusters.remove(c); } } @@ -341,15 +341,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri int i = minPE.i; int j = minPE.j; Iterator<ProjectedEnergy> it = projectedEnergies.iterator(); - while (it.hasNext()) { + while(it.hasNext()) { ProjectedEnergy pe = it.next(); - if (pe.i == i || pe.i == j || pe.j == i || pe.j == j) { + if(pe.i == i || pe.i == j || pe.j == i || pe.j == j) { it.remove(); - } else { - if (pe.i > j) { + } + else { + if(pe.i > j) { pe.i -= 1; } - if (pe.j > j) { + if(pe.j > j) { pe.j -= 1; } } @@ -357,10 +358,11 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // ... and recompute them ORCLUSCluster c_ij = minPE.cluster; - for (int c = 0; c < clusters.size(); c++) { - if (c < i) { + for(int c = 0; c < clusters.size(); c++) { + if(c < i) { projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, c, i, d_new)); - } else if (c > i) { + } + else if(c > i) { projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, i, c, d_new)); } } @@ -389,7 +391,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri double sum = 0.; V c_proj = projection(c_ij, c_ij.centroid, factory); - for (DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { V o_proj = projection(c_ij, database.get(iter), factory); double dist = distFunc.distance(o_proj, c_proj).doubleValue(); sum += dist * dist; @@ -417,15 +419,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri // convert into array. 
c.objectIDs = DBIDUtil.newArray(c.objectIDs); - if (c.objectIDs.size() > 0) { + if(c.objectIDs.size() > 0) { c.centroid = Centroid.make(relation, c.objectIDs).toVector(relation); c.basis = findBasis(relation, distFunc, c, dim); - } else { + } + else { NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5); c.centroid = factory.newNumberVector(cent.getArrayRef()); double[][] doubles = new double[c1.basis.getRowDimensionality()][dim]; - for (int i = 0; i < dim; i++) { + for(int i = 0; i < dim; i++) { doubles[i][i] = 1; } c.basis = new Matrix(doubles); @@ -590,16 +593,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri protected void configAlpha(Parameterization config) { DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5); - alphaP.addConstraint(new GreaterConstraint(0)); - alphaP.addConstraint(new LessEqualConstraint(1)); - if (config.grab(alphaP)) { + alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + alphaP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); + if(config.grab(alphaP)) { alpha = alphaP.doubleValue(); } } protected void configSeed(Parameterization config) { RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java index 545a8171..1b316c7c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java @@ -23,7 +23,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -67,12 +68,12 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh /** * Range to query with */ - D epsilon; + protected D epsilon; /** * Distance function to use */ - DistanceFunction<O, D> distFunc; + protected DistanceFunction<O, D> distFunc; /** * Full constructor. @@ -177,14 +178,14 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh protected void makeOptions(Parameterization config) { super.makeOptions(config); // Get a distance function. - ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); D distanceFactory = null; if(config.grab(distanceP)) { distfun = distanceP.instantiateClass(config); distanceFactory = distfun.getDistanceFactory(); } // Get the epsilon parameter - DistanceParameter<D> epsilonP = new DistanceParameter<>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory); + DistanceParameter<D> epsilonP = new DistanceParameter<>(DBSCAN.Parameterizer.EPSILON_ID, distanceFactory); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java index a6e62e2e..ac7ba81d 100644 --- 
a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java @@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; @@ -54,7 +55,7 @@ public class MinPtsCorePredicate implements CorePredicate { /** * The minpts parameter. */ - int minpts; + protected int minpts; /** * Default constructor. @@ -127,7 +128,7 @@ public class MinPtsCorePredicate implements CorePredicate { protected void makeOptions(Parameterization config) { super.makeOptions(config); // Get the minpts parameter - IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID); + IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java index ac5cb77c..f6dbc88f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java @@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; @@ -178,9 +178,10 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DataStore<D> lambda = pointerresult.getParentDistanceStore(); Clustering<DendrogramModel<D>> result; - if (lambda instanceof DoubleDistanceDataStore) { + if(lambda instanceof DoubleDistanceDataStore) { result = extractClustersDouble(ids, pi, (DoubleDistanceDataStore) lambda); - } else { + } + else { result = extractClusters(ids, pi, lambda); } result.addChildResult(pointerresult); @@ -208,28 +209,31 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDArrayIter it = order.iter(); // Used multiple times! int split; - if (minclusters > 0) { + if(minclusters > 0) { split = Math.max(ids.size() - minclusters, 0); // Stop distance: final D stopdist = lambda.get(order.get(split)); // Tie handling: decrement split. - while (split > 0) { + while(split > 0) { it.seek(split - 1); - if (stopdist.compareTo(lambda.get(it)) <= 0) { + if(stopdist.compareTo(lambda.get(it)) <= 0) { split--; - } else { + } + else { break; } } - } else if (threshold != null) { + } + else if(threshold != null) { split = ids.size(); it.seek(split - 1); - while (threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) { + while(threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) { split--; it.retract(); } - } else { // full hierarchy + } + else { // full hierarchy split = 0; } @@ -242,19 +246,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. // Go backwards on the lower part. 
- for (it.seek(split - 1); it.valid(); it.retract()) { + for(it.seek(split - 1); it.valid(); it.retract()) { D dist = lambda.get(it); // Distance to successor pi.assignVar(it, succ); // succ = pi(it) int clusterid = cluster_map.intValue(succ); // Successor cluster has already been created: - if (clusterid >= 0) { + if(clusterid >= 0) { cluster_dbids.get(clusterid).add(it); cluster_map.putInt(it, clusterid); // Update distance to maximum encountered: - if (cluster_dist.get(clusterid).compareTo(dist) < 0) { + if(cluster_dist.get(clusterid).compareTo(dist) < 0) { cluster_dist.set(clusterid, dist); } - } else { + } + else { // Need to start a new cluster: clusterid = cluster_dbids.size(); // next cluster number. ModifiableDBIDs cids = DBIDUtil.newArray(); @@ -270,12 +275,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } final Clustering<DendrogramModel<D>> dendrogram; - switch(outputmode) { + switch(outputmode){ case PARTIAL_HIERARCHY: { // Build a hierarchy out of these clusters. dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); @@ -284,74 +289,81 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); } cluster_dist = null; // Invalidate cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); // The current cluster led by the current element: final Cluster<DendrogramModel<D>> clus; - if (clusterid >= 0) { + if(clusterid >= 0) { clus = clusters.get(clusterid); - } else if (!singletons && ids.size() != 1) { + } + else if(!singletons && ids.size() != 1) { clus = null; - } else { + } + else { clus = makeCluster(it, null, DBIDUtil.deref(it)); } // The successor to join: pi.assignVar(it, succ); // succ = pi(it) - if (DBIDUtil.equal(it, succ)) { + if(DBIDUtil.equal(it, succ)) { assert (root == null); root = clus; - } else { + } + else { // Parent cluster: int parentid = cluster_map.intValue(succ); D depth = lambda.get(it); // Parent cluster exists - merge as a new cluster: - if (parentid >= 0) { + if(parentid >= 0) { final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); - if (pclus.getModel().getDistance().equals(depth)) { - if (clus == null) { + if(pclus.getModel().getDistance().equals(depth)) { + if(clus == null) { ((ModifiableDBIDs) pclus.getIDs()).add(it); - } else { + } + else { dendrogram.addChildCluster(pclus, clus); } - } else { + } + else { // Merge at new depth: ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); - if (clus == null) { + if(clus == null) { cids.add(it); } Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(npclus, clus); } dendrogram.addChildCluster(npclus, pclus); // Replace existing parent cluster: new depth clusters.set(parentid, npclus); } - } else { + } + else { // Merge with parent at this depth: final Cluster<DendrogramModel<D>> pclus; - if (!singletons) { + if(!singletons) { ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 
2 : 1); cids.add(succ); - if (clus == null) { + if(clus == null) { cids.add(it); } // New cluster for parent and/or new point pclus = makeCluster(succ, depth, cids); - } else { + } + else { // Create a new, one-element cluster for parent, and a merged // cluster on top. pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); } - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(pclus, clus); } // Store cluster: @@ -362,7 +374,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -377,21 +389,21 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { dendrogram.addToplevelCluster(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); } cluster_dist = null; // Invalidate cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); - if (clusterid < 0) { + if(clusterid < 0) { dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -401,7 +413,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement throw new AbortException("Unsupported output mode."); } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } @@ -428,29 +440,32 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDArrayIter it = order.iter(); // Used multiple times! int split; - if (minclusters > 0) { + if(minclusters > 0) { split = Math.max(ids.size() - minclusters, 0); // Stop distance: final double stopdist = lambda.doubleValue(order.get(split)); // Tie handling: decrement split. - while (split > 0) { + while(split > 0) { it.seek(split - 1); - if (stopdist <= lambda.doubleValue(it)) { + if(stopdist <= lambda.doubleValue(it)) { split--; - } else { + } + else { break; } } - } else if (threshold != null) { + } + else if(threshold != null) { split = ids.size(); it.seek(split - 1); double stopdist = ((DoubleDistance) threshold).doubleValue(); - while (stopdist <= lambda.doubleValue(it) && it.valid()) { + while(stopdist <= lambda.doubleValue(it) && it.valid()) { split--; it.retract(); } - } else { // full hierarchy + } + else { // full hierarchy split = 0; } @@ -463,19 +478,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. // Go backwards on the lower part. 
- for (it.seek(split - 1); it.valid(); it.retract()) { + for(it.seek(split - 1); it.valid(); it.retract()) { double dist = lambda.doubleValue(it); // Distance to successor pi.assignVar(it, succ); // succ = pi(it) int clusterid = cluster_map.intValue(succ); // Successor cluster has already been created: - if (clusterid >= 0) { + if(clusterid >= 0) { cluster_dbids.get(clusterid).add(it); cluster_map.putInt(it, clusterid); // Update distance to maximum encountered: - if (cluster_dist.get(clusterid) < dist) { + if(cluster_dist.get(clusterid) < dist) { cluster_dist.set(clusterid, dist); } - } else { + } + else { // Need to start a new cluster: clusterid = cluster_dbids.size(); // next cluster number. ModifiableDBIDs cids = DBIDUtil.newArray(); @@ -491,12 +507,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } final Clustering<DendrogramModel<D>> dendrogram; - switch(outputmode) { + switch(outputmode){ case PARTIAL_HIERARCHY: { // Build a hierarchy out of these clusters. dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); @@ -505,7 +521,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { @SuppressWarnings("unchecked") D depth = (D) new DoubleDistance(cluster_dist.get(i)); clusters.add(makeCluster(it2, depth, cluster_dbids.get(i))); @@ -514,68 +530,75 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); // The current cluster led by the current element: final Cluster<DendrogramModel<D>> clus; - if (clusterid >= 0) { + if(clusterid >= 0) { clus = clusters.get(clusterid); - } else if (!singletons && ids.size() != 1) { + } + else if(!singletons && ids.size() != 1) { clus = null; - } else { + } + else { clus = makeCluster(it, null, DBIDUtil.deref(it)); } // The successor to join: pi.assignVar(it, succ); // succ = pi(it) - if (DBIDUtil.equal(it, succ)) { + if(DBIDUtil.equal(it, succ)) { assert (root == null); root = clus; - } else { + } + else { // Parent cluster: int parentid = cluster_map.intValue(succ); @SuppressWarnings("unchecked") D depth = (D) new DoubleDistance(lambda.doubleValue(it)); // Parent cluster exists - merge as a new cluster: - if (parentid >= 0) { + if(parentid >= 0) { final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); - if (pclus.getModel().getDistance().equals(depth)) { - if (clus == null) { + if(pclus.getModel().getDistance().equals(depth)) { + if(clus == null) { ((ModifiableDBIDs) pclus.getIDs()).add(it); - } else { + } + else { dendrogram.addChildCluster(pclus, clus); } - } else { + } + else { // Merge at new depth: ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); - if (clus == null) { + if(clus == null) { cids.add(it); } Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(npclus, clus); } dendrogram.addChildCluster(npclus, pclus); // Replace existing parent cluster: new depth clusters.set(parentid, npclus); } - } else { + } + else { // Merge with parent at this depth: final Cluster<DendrogramModel<D>> pclus; - if (!singletons) { + if(!singletons) { ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 
2 : 1); cids.add(succ); - if (clus == null) { + if(clus == null) { cids.add(it); } // New cluster for parent and/or new point pclus = makeCluster(succ, depth, cids); - } else { + } + else { // Create a new, one-element cluster for parent, and a merged // cluster on top. pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); } - if (clus != null) { + if(clus != null) { dendrogram.addChildCluster(pclus, clus); } // Store cluster: @@ -586,7 +609,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -601,7 +624,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Convert initial clusters to cluster objects { int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { @SuppressWarnings("unchecked") D depth = (D) new DoubleDistance(cluster_dist.get(i)); dendrogram.addToplevelCluster(makeCluster(it2, depth, cluster_dbids.get(i))); @@ -610,14 +633,14 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement cluster_dbids = null; // Invalidate } // Process the upper part, bottom-up. 
- for (it.seek(split); it.valid(); it.advance()) { + for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); - if (clusterid < 0) { + if(clusterid < 0) { dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); } // Decrement counter - if (progress != null) { + if(progress != null) { progress.incrementProcessed(LOG); } } @@ -627,7 +650,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement throw new AbortException("Unsupported output mode."); } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } @@ -644,13 +667,16 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement */ private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members) { final String name; - if (members.size() == 0) { + if(members.size() == 0) { name = "mrg_" + DBIDUtil.toString(lead) + "_" + depth; - } else if (depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) { + } + else if(depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) { name = "obj_" + DBIDUtil.toString(lead); - } else if (depth != null) { + } + else if(depth != null) { name = "clu_" + DBIDUtil.toString(lead) + "_" + depth; - } else { + } + else { // Complete data set only? 
name = "clu_" + DBIDUtil.toString(lead); } @@ -794,53 +820,54 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement protected void makeOptions(Parameterization config) { super.makeOptions(config); ObjectParameter<HierarchicalClusteringAlgorithm<D>> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class); - if (config.grab(algorithmP)) { + if(config.grab(algorithmP)) { algorithm = algorithmP.instantiateClass(config); } EnumParameter<ThresholdMode> modeP = new EnumParameter<>(MODE_ID, ThresholdMode.class, ThresholdMode.BY_MINCLUSTERS); - if (config.grab(modeP)) { + if(config.grab(modeP)) { thresholdmode = modeP.getValue(); } - if (thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) { + if(thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) { IntParameter minclustersP = new IntParameter(MINCLUSTERS_ID); - minclustersP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(minclustersP)) { + minclustersP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(minclustersP)) { minclusters = minclustersP.intValue(); } } - if (thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) { + if(thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) { // Fallback to double when no algorithm chosen yet: @SuppressWarnings("unchecked") final D factory = algorithm != null ? 
algorithm.getDistanceFactory() : (D) DoubleDistance.FACTORY; DistanceParameter<D> distP = new DistanceParameter<>(THRESHOLD_ID, factory); - if (config.grab(distP)) { + if(config.grab(distP)) { threshold = distP.getValue(); } } - if (thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) { + if(thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) { EnumParameter<OutputMode> outputP = new EnumParameter<>(OUTPUTMODE_ID, OutputMode.class); - if (config.grab(outputP)) { + if(config.grab(outputP)) { outputmode = outputP.getValue(); } - } else { + } + else { // This becomes full hierarchy: minclusters = -1; outputmode = OutputMode.PARTIAL_HIERARCHY; } Flag singletonsF = new Flag(SINGLETONS_ID); - if (config.grab(singletonsF)) { + if(config.grab(singletonsF)) { singletons = singletonsF.isTrue(); } } @Override protected ExtractFlatClusteringFromHierarchy<D> makeInstance() { - switch(thresholdmode) { + switch(thresholdmode){ case NO_THRESHOLD: case BY_MINCLUSTERS: return new ExtractFlatClusteringFromHierarchy<>(algorithm, minclusters, outputmode, singletons); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index dc1fa47c..5754e961 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.MeanModel; import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; @@ -49,8 +50,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; 
import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -105,68 +105,61 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param relation the database to cluster * @param means a list of k means * @param clusters cluster assignment + * @param assignment Current cluster assignment * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) { + protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { boolean changed = false; - if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?>>) getDistanceFunction(); - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if (dist < mindist) { + if(dist < mindist) { minIndex = i; mindist = dist; } } - if (clusters.get(minIndex).add(iditer)) { - changed = true; - // Remove from previous cluster - // TODO: keep a list of cluster assignments to save this search? - for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { - break; - } - } - } - } + changed |= updateAssignment(iditer, clusters, assignment, minIndex); } - } else { + } + else { final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if (dist.compareTo(mindist) < 0) { + if(dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } - if (clusters.get(minIndex).add(iditer)) { - changed = true; - // Remove from previous cluster - // TODO: keep a list of cluster assignments to save this search? - for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { - break; - } - } - } - } + changed |= updateAssignment(iditer, clusters, assignment, minIndex); } } return changed; } + protected boolean updateAssignment(DBIDIter iditer, List<? 
extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, int newA) { + final int oldA = assignment.intValue(iditer); + if(oldA == newA) { + return false; + } + clusters.get(newA).add(iditer); + assignment.putInt(iditer, newA); + if(oldA >= 0) { + clusters.get(oldA).remove(iditer); + } + return true; + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(new CombinedTypeInformation(TypeUtil.NUMBER_VECTOR_FIELD, getDistanceFunction().getInputTypeRestriction())); @@ -181,24 +174,28 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @return the mean vectors of the given clusters in the given database */ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) { + // TODO: use Kahan summation for better numerical precision? List<Vector> newMeans = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - if (list.size() > 0) { - double s = 1.0 / list.size(); + if(list.size() > 0) { DBIDIter iter = list.iter(); - assert (iter.valid()); - mean = database.get(iter).getColumnVector().timesEquals(s); + // Initialize with first. + mean = database.get(iter).getColumnVector(); double[] raw = mean.getArrayRef(); iter.advance(); - for (; iter.valid(); iter.advance()) { + // Update with remaining instances + for(; iter.valid(); iter.advance()) { NumberVector<?> vec = database.get(iter); - for (int j = 0; j < mean.getDimensionality(); j++) { - raw[j] += s * vec.doubleValue(j); + for(int j = 0; j < mean.getDimensionality(); j++) { + raw[j] += vec.doubleValue(j); } } - } else { + mean.timesEquals(1.0 / list.size()); + } + else { + // Keep degenerated means as-is for now. 
mean = means.get(i).getColumnVector(); } newMeans.add(mean); @@ -218,17 +215,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan final int dim = medians.get(0).getDimensionality(); final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); List<NumberVector<?>> newMedians = new ArrayList<>(k); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); - if (list.size() > 0) { + if(list.size() > 0) { Vector mean = new Vector(dim); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { sorter.setDimension(d); DBID id = QuickSelect.median(list, sorter); mean.set(d, database.get(id).doubleValue(d)); } newMedians.add(mean); - } else { + } + else { newMedians.add((NumberVector<?>) medians.get(i)); } } @@ -244,14 +242,11 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param op Cluster size change / Weight change */ protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) { - if (newsize == 0) { + if(newsize == 0) { return; // Keep old mean } - Vector delta = vec.getColumnVector(); - // Compute difference from mean - delta.minusEquals(mean); - delta.timesEquals(op / newsize); - mean.plusEquals(delta); + Vector delta = vec.getColumnVector().minusEquals(mean); + mean.plusTimesEquals(delta, op / newsize); } /** @@ -260,76 +255,84 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param relation Relation * @param means Means * @param clusters Clusters + * @param assignment Current cluster assignment * @return true when the means have changed */ - protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { + protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { boolean changed = false; - if 
(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); // Incremental update - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if (dist < mindist) { + if(dist < mindist) { minIndex = i; mindist = dist; } } - // Update the cluster mean incrementally: - for (int i = 0; i < k; i++) { - ModifiableDBIDs ci = clusters.get(i); - if (i == minIndex) { - if (ci.add(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size(), +1); - changed = true; - } - } else if (ci.remove(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); - changed = true; - } - } + changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); } - } else { + } + else { // Raw distance function final PrimitiveDistanceFunction<? 
super NumberVector<?>, D> df = getDistanceFunction(); // Incremental update - for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if (dist.compareTo(mindist) < 0) { + if(dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } - // Update the cluster mean incrementally: - for (int i = 0; i < k; i++) { - ModifiableDBIDs ci = clusters.get(i); - if (i == minIndex) { - if (ci.add(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size(), +1); - changed = true; - } - } else if (ci.remove(iditer)) { - incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); - changed = true; - } - } + changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment); } } return changed; } + /** + * Try to update the cluster assignment. 
+ * + * @param clusters Current clusters + * @param means Means to update + * @param minIndex Cluster to assign to + * @param fv Vector + * @param iditer Object ID + * @param assignment Current cluster assignment + * @return {@code true} when assignment changed + */ + private boolean updateMeanAndAssignment(List<ModifiableDBIDs> clusters, List<Vector> means, int minIndex, V fv, DBIDIter iditer, WritableIntegerDataStore assignment) { + int cur = assignment.intValue(iditer); + if(cur == minIndex) { + return false; + } + final ModifiableDBIDs curclus = clusters.get(minIndex); + curclus.add(iditer); + incrementalUpdateMean(means.get(minIndex), fv, curclus.size(), +1); + + if(cur >= 0) { + ModifiableDBIDs ci = clusters.get(cur); + ci.remove(iditer); + incrementalUpdateMean(means.get(cur), fv, ci.size() + 1, -1); + } + + assignment.putInt(iditer, minIndex); + return true; + } + @Override public void setK(int k) { this.k = k; @@ -366,27 +369,27 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan @Override protected void makeOptions(Parameterization config) { ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if (config.grab(distanceFunctionP)) { + if(config.grab(distanceFunctionP)) { distanceFunction = distanceFunctionP.instantiateClass(config); - if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); } } IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + 
kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.getValue(); } ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(maxiterP)) { maxiter = maxiterP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java index 30bb640c..51e7ace9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java @@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -90,34 +90,35 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< @Override public Clustering<M> run(Database database, Relation<V> relation) { - if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { + 
if(!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass()); } final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction(); Clustering<M> bestResult = null; - if (trials > 1) { + if(trials > 1) { double bestCost = Double.POSITIVE_INFINITY; FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null; - for (int i = 0; i < trials; i++) { + for(int i = 0; i < trials; i++) { Clustering<M> currentCandidate = innerkMeans.run(database, relation); double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("Cost of candidate " + i + ": " + currentCost); } - if (currentCost < bestCost) { + if(currentCost < bestCost) { bestResult = currentCandidate; bestCost = currentCost; } - if (prog != null) { + if(prog != null) { prog.incrementProcessed(LOG); } } - if (prog != null) { + if(prog != null) { prog.ensureCompleted(LOG); } - } else { + } + else { bestResult = innerkMeans.run(database); } @@ -195,18 +196,18 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance< @Override protected void makeOptions(Parameterization config) { IntParameter trialsP = new IntParameter(TRIALS_ID); - trialsP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(trialsP)) { + trialsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(trialsP)) { trials = trialsP.intValue(); } ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); - if (config.grab(kMeansVariantP)) { + if(config.grab(kMeansVariantP)) { kMeansVariant = kMeansVariantP.instantiateClass(config); } ObjectParameter<KMeansQualityMeasure<V, ? 
super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class); - if (config.grab(qualityMeasureP)) { + if(config.grab(qualityMeasureP)) { qualityMeasure = qualityMeasureP.instantiateClass(config); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java index a018c04b..9edfd816 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.ArrayList; import java.util.List; -import java.util.Random; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.Database; @@ -74,7 +73,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten @Override public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { // Get a distance query - if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") @@ -84,26 +83,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean List<V> means = new ArrayList<>(k); - Random random = rnd.getRandom(); - DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter(); means.add(relation.get(first)); DBIDVar best = DBIDUtil.newVar(first); - for (int i = (dropfirst ? 0 : 1); i < k; i++) { + for(int i = (dropfirst ? 
0 : 1); i < k; i++) { // Find farthest object: double maxdist = Double.NEGATIVE_INFINITY; - for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { double dsum = 0.; - for (V ex : means) { + for(V ex : means) { dsum += distQ.distance(ex, it).doubleValue(); } - if (dsum > maxdist) { + if(dsum > maxdist) { maxdist = dsum; best.set(it); } } // Add new mean: - if (k == 0) { + if(k == 0) { means.clear(); // Remove temporary first element. } means.add(relation.get(best)); @@ -114,7 +112,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten @Override public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { - if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") @@ -123,26 +121,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - Random random = rnd.getRandom(); - DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter(); means.add(first); DBIDVar best = DBIDUtil.newVar(first); - for (int i = (dropfirst ? 0 : 1); i < k; i++) { + for(int i = (dropfirst ? 
0 : 1); i < k; i++) { // Find farthest object: double maxdist = Double.NEGATIVE_INFINITY; - for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { double dsum = 0.; - for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { + for(DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { dsum += distQ.distance(ex, it).doubleValue(); } - if (dsum > maxdist) { + if(dsum > maxdist) { maxdist = dsum; best.set(it); } } // Add new mean: - if (k == 0) { + if(k == 0) { means.clear(); // Remove temporary first element. } means.add(best); @@ -173,7 +170,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten protected void makeOptions(Parameterization config) { super.makeOptions(config); Flag dropfirstP = new Flag(DROPFIRST_ID); - if (config.grab(dropfirstP)) { + if(config.grab(dropfirstP)) { dropfirst = dropfirstP.isTrue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java new file mode 100644 index 00000000..aec4fe0f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java @@ -0,0 +1,346 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * Provides the k-means algorithm, using Lloyd-style bulk iterations. + * + * However, in contrast to Lloyd's k-means and similar to MacQueen, we do update + * the mean vectors multiple times, not only at the very end of the iteration. + * This should yield faster convergence at little extra cost. + * + * To avoid issues with ordered data, we use random sampling to obtain the data + * blocks. + * + * @author Erich Schubert + * + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansBatchedLloyd.class); + + /** + * Number of blocks to use. + */ + int blocks; + + /** + * Random used for partitioning. + */ + RandomFactory random; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Initialization method + * @param blocks Number of blocks + * @param random Random factory used for partitioning. + */ + public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer, int blocks, RandomFactory random) { + super(distanceFunction, k, maxiter, initializer); + this.blocks = blocks; + this.random = random; + } + + @Override + public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + final int dim = RelationUtil.dimensionality(relation); + // Choose initial means + List<? 
extends NumberVector<?>> mvs = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); + // Convert to (modifiable) math vectors. + List<Vector> means = new ArrayList<>(k); + for (NumberVector<?> m : mvs) { + means.add(m.getColumnVector()); + } + + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for (int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + + ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), blocks, random); + + double[][] meanshift = new double[k][dim]; + int[] changesize = new int[k]; + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if (prog != null) { + prog.incrementProcessed(LOG); + } + boolean changed = false; + FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null; + for (int p = 0; p < parts.length; p++) { + // Initialize new means scratch space. + for (int i = 0; i < k; i++) { + Arrays.fill(meanshift[i], 0.); + } + Arrays.fill(changesize, 0); + changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment); + // Recompute means. + updateMeans(means, meanshift, clusters, changesize); + if (pprog != null) { + pprog.incrementProcessed(LOG); + } + } + if (pprog != null) { + pprog.ensureCompleted(LOG); + } + // Stop if no cluster assignment changed. 
+ if (!changed) { + break; + } + } + if (prog != null) { + prog.setCompleted(LOG); + } + + // Wrap result + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for (int i = 0; i < clusters.size(); i++) { + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param relation the database to cluster + * @param ids IDs to process + * @param oldmeans a list of k means + * @param meanshift delta to apply to each mean + * @param changesize New cluster sizes + * @param clusters cluster assignment + * @param assignment Current cluster assignment + * @return true when the object was reassigned + */ + protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector<?>> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) { + boolean changed = false; + + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?>>) getDistanceFunction(); + for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY; + V fv = relation.get(iditer); + int minIndex = 0; + for (int i = 0; i < k; i++) { + double dist = df.doubleDistance(fv, oldmeans.get(i)); + if (dist < mindist) { + minIndex = i; + mindist = dist; + } + } + changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); + } + } else { + final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); + for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + D mindist = df.getDistanceFactory().infiniteDistance(); + V fv = relation.get(iditer); + int minIndex = 0; + for (int i = 0; i < k; i++) { + D dist = df.distance(fv, oldmeans.get(i)); + if (dist.compareTo(mindist) < 0) { + minIndex = i; + mindist = dist; + } + } + changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex); + } + } + return changed; + } + + /** + * Update the assignment of a single object. + * + * @param id Object to assign + * @param fv Vector + * @param clusters Clusters + * @param assignment Current cluster assignment + * @param meanshift Current shifting offset + * @param changesize Size change of the current cluster + * @param minIndex Index of best cluster. + * @return {@code true} when assignment changed. + */ + protected boolean updateAssignment(DBIDIter id, V fv, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[][] meanshift, int[] changesize, int minIndex) { + int cur = assignment.intValue(id); + if (cur == minIndex) { + return false; + } + // Add to new cluster. 
+ { + clusters.get(minIndex).add(id); + changesize[minIndex]++; + double[] raw = meanshift[minIndex]; + for (int j = 0; j < fv.getDimensionality(); j++) { + raw[j] += fv.doubleValue(j); + } + } + // Remove from previous cluster + if (cur >= 0) { + clusters.get(cur).remove(id); + changesize[cur]--; + double[] raw = meanshift[cur]; + for (int j = 0; j < fv.getDimensionality(); j++) { + raw[j] -= fv.doubleValue(j); + } + } + assignment.putInt(id, minIndex); + return true; + } + + /** + * Merge changes into mean vectors. + * + * @param means Mean vectors + * @param meanshift Shift offset + * @param clusters + * @param changesize Size of change (for weighting!) + */ + protected void updateMeans(List<Vector> means, double[][] meanshift, List<ModifiableDBIDs> clusters, int[] changesize) { + for (int i = 0; i < k; i++) { + int newsize = clusters.get(i).size(), oldsize = newsize - changesize[i]; + if (newsize == 0) { + continue; // Keep previous mean vector. + } + if (oldsize == 0) { + means.set(i, new Vector(meanshift[i]).times(1. / newsize)); + continue; // Replace with new vector. + } + if (oldsize == newsize) { + means.get(i).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize); + continue; + } + means.get(i).timesEquals(oldsize / (double) newsize).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize); + } + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + /** + * Parameter for the number of blocks. + */ + public static final OptionID BLOCKS_ID = new OptionID("kmeans.blocks", "Number of blocks to use for processing. Means will be recomputed after each block."); + + /** + * Random source for blocking. 
+ */ + public static final OptionID RANDOM_ID = new OptionID("kmeans.blocks.random", "Random source for producing blocks."); + + /** + * Number of blocks. + */ + int blocks; + + /** + * Random used for partitioning. + */ + RandomFactory random; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter blocksP = new IntParameter(BLOCKS_ID, 10); + blocksP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if (config.grab(blocksP)) { + blocks = blocksP.intValue(); + } + RandomParameter randomP = new RandomParameter(RANDOM_ID); + if (config.grab(randomP)) { + random = randomP.getValue(); + } + } + + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected KMeansBatchedLloyd<V, D> makeInstance() { + return new KMeansBatchedLloyd<>(distanceFunction, k, maxiter, initializer, blocks, random); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java index 37071d36..80a581b1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java @@ -41,7 +41,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -205,7 +205,7 @@ public 
class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(new GreaterConstraint(1)); + kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if (config.grab(kP)) { k = kP.intValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java new file mode 100644 index 00000000..2a60ef27 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java @@ -0,0 +1,155 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.KMeansModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; + +/** + * Provides the k-means algorithm, alternating between MacQueen-style + * incremental processing and Lloyd-Style batch steps. + * + * @author Erich Schubert + * + * @apiviz.landmark + * @apiviz.has KMeansModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansHybridLloydMacQueen.class); + + /** + * Constructor. 
+ * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Initialization method + */ + public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + @Override + public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { + if (relation.size() <= 0) { + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); + } + // Choose initial means + List<Vector> means = new ArrayList<>(k); + for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) { + means.add(nv.getColumnVector()); + } + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<>(); + for (int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); + } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); + + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; + for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) { + { // MacQueen + if (prog != null) { + prog.incrementProcessed(LOG); + } + boolean changed = macQueenIterate(relation, means, clusters, assignment); + if (!changed) { + break; + } + } + { // Lloyd + if (prog != null) { + prog.incrementProcessed(LOG); + } + boolean changed = assignToNearestCluster(relation, means, clusters, assignment); + // Stop if no cluster assignment changed. + if (!changed) { + break; + } + // Recompute means. 
+ means = means(clusters, means, relation); + } + } + if (prog != null) { + prog.setCompleted(LOG); + } + + // Wrap result + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); + for (int i = 0; i < clusters.size(); i++) { + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { + @Override + protected Logging getLogger() { + return LOG; + } + + @Override + protected KMeansHybridLloydMacQueen<V, D> makeInstance() { + return new KMeansHybridLloydMacQueen<>(distanceFunction, k, maxiter, initializer); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index e692293c..686e2076 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.KMeansModel; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; 
@@ -93,15 +96,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { if (prog != null) { prog.incrementProcessed(LOG); } - boolean changed = assignToNearestCluster(relation, means, clusters); + boolean changed = assignToNearestCluster(relation, means, clusters, assignment); // Stop if no cluster assignment changed. if (!changed) { break; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index bb689bd3..a0f4bb3f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.KMeansModel; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -95,11 +98,9 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex // Initialize cluster and assign objects 
List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } - assignToNearestCluster(relation, means, clusters); - // Initial recomputation of the means. - means = means(clusters, means, relation); + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; // Refine result @@ -107,7 +108,7 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex if (prog != null) { prog.incrementProcessed(LOG); } - boolean changed = macQueenIterate(relation, means, clusters); + boolean changed = macQueenIterate(relation, means, clusters, assignment); if (!changed) { break; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index 302ca86b..6fc514eb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -84,8 +84,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean List<V> means = new ArrayList<>(k); - Random random = rnd.getRandom(); - DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter()); + Random random = rnd.getSingleThreadedRandom(); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, random).iter()); means.add(relation.get(first)); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); @@ -134,8 +134,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten // Chose first mean 
ArrayModifiableDBIDs means = DBIDUtil.newArray(k); - Random random = rnd.getRandom(); - DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, new Random(random.nextLong())).iter()); + Random random = rnd.getSingleThreadedRandom(); + DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random).iter()); means.add(first); ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index cc7aaa9e..0a97c4d3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.model.MeanModel; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -88,15 +91,16 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { - clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k))); } + WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1); IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("K-Medians iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { if (prog != null) { prog.incrementProcessed(LOG); } - boolean changed = assignToNearestCluster(relation, medians, clusters); + boolean changed = assignToNearestCluster(relation, medians, clusters, assignment); // Stop if no cluster assignment changed. if (!changed) { break; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index 87a0c7ae..41cca225 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.Mean; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -119,7 +118,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista * @return result */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); @@ -127,7 +126,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista 
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } Mean[] mdists = Mean.newArray(k); @@ -139,47 +138,47 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null; // Swap phase boolean changed = true; - while (changed) { - if (prog != null) { + while(changed) { + if(prog != null) { prog.incrementProcessed(LOG); } changed = false; // Try to swap the medoid with a better cluster member: int i = 0; - for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { DBID best = null; Mean bestm = mdists[i]; - for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { - if (DBIDUtil.equal(miter, iter)) { + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(DBIDUtil.equal(miter, iter)) { continue; } Mean mdist = new Mean(); - for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { + for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { mdist.put(distQ.distance(iter, iter2).doubleValue()); } - if (mdist.getMean() < bestm.getMean()) { + if(mdist.getMean() < bestm.getMean()) { best = DBIDUtil.deref(iter); bestm = mdist; } } - if (best != null && !DBIDUtil.equal(miter, best)) { + if(best != null && !DBIDUtil.equal(miter, best)) { changed = true; medoids.set(i, best); mdists[i] = bestm; } } // Reassign - if (changed) { + if(changed) { assignToNearestCluster(medoids, mdists, clusters, distQ); } } - if (prog != null) { + if(prog != null) { prog.setCompleted(LOG); } // Wrap result Clustering<MedoidModel> result = new 
Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } @@ -200,27 +199,27 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista boolean changed = false; double[] dists = new double[k]; - for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { int minIndex = 0; double mindist = Double.POSITIVE_INFINITY; { int i = 0; - for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { dists[i] = distQ.distance(iditer, miter).doubleValue(); - if (dists[i] < mindist) { + if(dists[i] < mindist) { minIndex = i; mindist = dists[i]; } } } - if (clusters.get(minIndex).add(iditer)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; mdist[minIndex].put(mindist); // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? 
- for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { mdist[minIndex].put(dists[i], -1); break; } @@ -259,19 +258,19 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.intValue(); } ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index 1feda867..c9e1dc47 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -53,8 +53,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -124,7 +123,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @return result */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { - if (relation.size() <= 0) { + if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); @@ -133,7 +132,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); - for (int i = 0; i < k; i++) { + for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -145,8 +144,8 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase boolean changed = true; - while (changed) { - if (prog != null) { + while(changed) { + if(prog != null) { prog.incrementProcessed(LOG); } changed = false; @@ -155,57 +154,60 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist DBID bestid = null; int bestcluster = -1; int i = 0; - for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { - for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { - if (DBIDUtil.equal(miter, iter)) { + for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(DBIDUtil.equal(miter, iter)) { continue; } // double disti = distQ.distance(id, med).doubleValue(); double cost = 0; DBIDIter olditer = medoids.iter(); - for (int j = 0; j < k; j++, olditer.advance()) { - for (DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { + for(int j = 0; j < k; j++, olditer.advance()) { + for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { double distcur = distQ.distance(iter2, olditer).doubleValue(); double distnew = distQ.distance(iter2, iter).doubleValue(); - if (j == i) { + if(j == i) { // Cases 1 and 2. double distsec = second.doubleValue(iter2); - if (distcur > distsec) { + if(distcur > distsec) { // Case 1, other would switch to a third medoid cost += distsec - distcur; // Always positive! 
- } else { // Would remain with the candidate + } + else { // Would remain with the candidate cost += distnew - distcur; // Could be negative } - } else { + } + else { // Cases 3-4: objects from other clusters - if (distcur < distnew) { + if(distcur < distnew) { // Case 3: no change - } else { + } + else { // Case 4: would switch to new medoid cost += distnew - distcur; // Always negative } } } } - if (cost < best) { + if(cost < best) { best = cost; bestid = DBIDUtil.deref(iter); bestcluster = i; } } } - if (prog != null) { + if(prog != null) { prog.setCompleted(LOG); } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.debug("Best cost: " + best); } - if (bestid != null) { + if(bestid != null) { changed = true; medoids.set(bestcluster, bestid); } // Reassign - if (changed) { + if(changed) { // TODO: can we save some of these recomputations? assignToNearestCluster(medoids, ids, second, clusters, distQ); } @@ -213,7 +215,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist // Wrap result Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } @@ -234,30 +236,31 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? 
extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { boolean changed = false; - for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { int minIndex = 0; double mindist = Double.POSITIVE_INFINITY; double mindist2 = Double.POSITIVE_INFINITY; { int i = 0; - for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { + for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { double dist = distQ.distance(iditer, miter).doubleValue(); - if (dist < mindist) { + if(dist < mindist) { minIndex = i; mindist2 = mindist; mindist = dist; - } else if (dist < mindist2) { + } + else if(dist < mindist2) { mindist2 = dist; } } } - if (clusters.get(minIndex).add(iditer)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for (int i = 0; i < k; i++) { - if (i != minIndex) { - if (clusters.get(i).remove(iditer)) { + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -296,19 +299,19 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { k = kP.intValue(); } ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); - if (config.grab(initialP)) { + if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if 
(config.grab(maxiterP)) { + maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index ee90e0dc..1329132e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -60,7 +60,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); List<V> means = new ArrayList<>(k); - final Random random = rnd.getRandom(); + final Random random = rnd.getSingleThreadedRandom(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); // Rescale diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java index 9f0a1923..79013364 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java @@ -93,7 +93,7 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv); List<V> means = new ArrayList<>(); for (Cluster<? 
extends MeanModel<V>> cluster : clusters.getAllClusters()) { - means.add((V) cluster.getModel().getMean()); + means.add(cluster.getModel().getMean()); } return means; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java index ed9a528d..1be19bd1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java @@ -1,4 +1,27 @@ /** * Quality measures for k-Means results. */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java new file mode 100644 index 00000000..55114f7d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java @@ -0,0 +1,384 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Cluster one-dimensional data by splitting the data set on local minima after + * performing kernel density estimation. 
+ * + * @author Erich Schubert + */ +public class KNNKernelDensityMinimaClustering<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<ClusterModel>> implements ClusteringAlgorithm<Clustering<ClusterModel>> { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(KNNKernelDensityMinimaClustering.class); + + /** + * Estimation mode. + * + * @apiviz.exclude + */ + public static enum Mode { + BALLOON, // Balloon estimator + SAMPLE, // Sample-point estimator + } + + /** + * Dimension to use for clustering. + */ + protected int dim; + + /** + * Kernel density function. + */ + protected KernelDensityFunction kernel; + + /** + * Estimation modes. + */ + protected Mode mode; + + /** + * Number of neighbors to use for bandwidth. + */ + protected int k; + + /** + * Window width, for local minima criterions. + */ + protected int minwindow; + + /** + * Constructor. + * + * @param dim Dimension to use for clustering + * @param kernel Kernel function + * @param mode Bandwidth mode + * @param k Number of neighbors + * @param minwindow Window size for comparison + */ + public KNNKernelDensityMinimaClustering(int dim, KernelDensityFunction kernel, Mode mode, int k, int minwindow) { + super(); + this.dim = dim; + this.kernel = kernel; + this.mode = mode; + this.k = k; + this.minwindow = minwindow; + } + + /** + * Run the clustering algorithm on a data relation. + * + * @param relation Relation + * @return Clustering result + */ + public Clustering<ClusterModel> run(Relation<V> relation) { + ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs()); + final int size = ids.size(); + + // Sort by the sole dimension + ids.sort(new VectorUtil.SortDBIDsBySingleDimension(relation, dim)); + + // Density storage. + WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.); + + DBIDArrayIter iter = ids.iter(), iter2 = ids.iter(); + + StepProgress sprog = LOG.isVerbose() ? 
new StepProgress("Clustering steps", 2) : null; + + if(sprog != null) { + sprog.beginStep(1, "Kernel density estimation.", LOG); + } + { + double[] scratch = new double[2 * k]; + iter.seek(0); + for(int i = 0; i < size; i++, iter.advance()) { + // Current value. + final double curv = relation.get(iter).doubleValue(dim); + + final int pre = Math.max(i - k, 0), prek = i - pre; + final int pos = Math.min(i + k, size - 1), posk = pos - i; + iter2.seek(pre); + for(int j = 0; j < prek; j++, iter2.advance()) { + scratch[j] = curv - relation.get(iter2).doubleValue(dim); + } + assert (iter2.getOffset() == i); + iter2.advance(); + for(int j = 0; j < posk; j++, iter2.advance()) { + scratch[prek + j] = relation.get(iter2).doubleValue(dim) - curv; + } + + assert (prek + posk >= k); + double kdist = QuickSelect.quickSelect(scratch, 0, prek + posk, k); + switch(mode){ + case BALLOON: { + double dens = 0.; + if(kdist > 0.) { + for(int j = 0; j < prek + posk; j++) { + dens += kernel.density(scratch[j] / kdist); + } + } + else { + dens = Double.POSITIVE_INFINITY; + } + assert (iter.getOffset() == i); + density.putDouble(iter, dens); + break; + } + case SAMPLE: { + if(kdist > 0.) 
{ + iter2.seek(pre); + for(int j = 0; j < prek; j++, iter2.advance()) { + double delta = curv - relation.get(iter2).doubleValue(dim); + density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist)); + } + assert (iter2.getOffset() == i); + iter2.advance(); + for(int j = 0; j < posk; j++, iter2.advance()) { + double delta = relation.get(iter2).doubleValue(dim) - curv; + density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist)); + } + } + else { + iter2.seek(pre); + for(int j = 0; j < prek; j++, iter2.advance()) { + double delta = curv - relation.get(iter2).doubleValue(dim); + if(!(delta > 0.)) { + density.putDouble(iter2, Double.POSITIVE_INFINITY); + } + } + assert (iter2.getOffset() == i); + iter2.advance(); + for(int j = 0; j < posk; j++, iter2.advance()) { + double delta = relation.get(iter2).doubleValue(dim) - curv; + if(!(delta > 0.)) { + density.putDouble(iter2, Double.POSITIVE_INFINITY); + } + } + } + break; + } + default: + throw new UnsupportedOperationException("Unknown mode specified."); + } + } + } + + if(sprog != null) { + sprog.beginStep(2, "Local minima detection.", LOG); + } + Clustering<ClusterModel> clustering = new Clustering<>("onedimensional-kde-clustering", "One-Dimensional clustering using kernel density estimation."); + { + double[] scratch = new double[2 * minwindow + 1]; + int begin = 0; + int halfw = (minwindow + 1) >> 1; + iter.seek(0); + // Fill initial buffer. 
+ for(int i = 0; i < size; i++, iter.advance()) { + final int m = i % scratch.length, t = (i - minwindow - 1) % scratch.length; + scratch[m] = density.doubleValue(iter); + if(i > scratch.length) { + double min = Double.POSITIVE_INFINITY; + for(int j = 0; j < scratch.length; j++) { + if(j != t && scratch[j] < min) { + min = scratch[j]; + } + } + // Local minimum: + if(scratch[t] < min) { + int end = i - minwindow + 1; + { // Test on which side the kNN is + iter2.seek(end); + double curv = relation.get(iter2).doubleValue(dim); + iter2.seek(end - halfw); + double left = relation.get(iter2).doubleValue(dim) - curv; + iter2.seek(end + halfw); + double right = curv - relation.get(iter2).doubleValue(dim); + if(left < right) { + end++; + } + } + iter2.seek(begin); + ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin); + for(int j = 0; j < end - begin; j++, iter2.advance()) { + cids.add(iter2); + } + clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER)); + begin = end; + } + } + } + // Extract last cluster + int end = size; + iter2.seek(begin); + ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin); + for(int j = 0; j < end - begin; j++, iter2.advance()) { + cids.add(iter2); + } + clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER)); + } + + if(sprog != null) { + sprog.setCompleted(LOG); + } + return clustering; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(new VectorFieldTypeInformation<>(NumberVector.class, dim + 1, Integer.MAX_VALUE)); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + /** + * Dimension to use for clustering. + */ + public static final OptionID DIM_ID = new OptionID("kernelcluster.dim", "Dimension to use for clustering. 
For one-dimensional data, use 0."); + + /** + * Kernel function. + */ + public static final OptionID KERNEL_ID = new OptionID("kernelcluster.kernel", "Kernel function for density estimation."); + + /** + * KDE mode. + */ + public static final OptionID MODE_ID = new OptionID("kernelcluster.mode", "Kernel density estimation mode (baloon estimator vs. sample point estimator)."); + + /** + * Number of neighbors for bandwidth estimation. + */ + public static final OptionID K_ID = new OptionID("kernelcluster.knn", "Number of nearest neighbors to use for bandwidth estimation."); + + /** + * Half window width to find local minima. + */ + public static final OptionID WINDOW_ID = new OptionID("kernelcluster.window", "Half width of sliding window to find local minima."); + + /** + * Dimension to use for clustering. + */ + protected int dim; + + /** + * Kernel density function. + */ + protected KernelDensityFunction kernel; + + /** + * Estimation modes. + */ + protected Mode mode; + + /** + * Number of neighbors to use for bandwidth. + */ + protected int k; + + /** + * Window width, for local minima criterions. 
+ */ + protected int minwindow; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter dimP = new IntParameter(DIM_ID, 0); + dimP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + if(config.grab(dimP)) { + dim = dimP.intValue(); + } + + ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class); + if(config.grab(kernelP)) { + kernel = kernelP.instantiateClass(config); + } + + EnumParameter<Mode> modeP = new EnumParameter<>(MODE_ID, Mode.class, Mode.BALLOON); + if(config.grab(modeP)) { + mode = modeP.getValue(); + } + + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { + k = kP.intValue(); + } + + IntParameter windowP = new IntParameter(WINDOW_ID); + windowP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(windowP)) { + minwindow = windowP.intValue(); + } + } + + @Override + protected KNNKernelDensityMinimaClustering<V> makeInstance() { + return new KNNKernelDensityMinimaClustering<>(dim, kernel, mode, k, minwindow); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java new file mode 100644 index 00000000..c6c55244 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java @@ -0,0 +1,27 @@ +/** + * Clustering algorithms for one-dimensional data. 
+ */ + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index db026e93..617d74cd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -56,8 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; @@ -594,14 +593,14 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter xsiP = new IntParameter(XSI_ID); - xsiP.addConstraint(new GreaterConstraint(0)); + xsiP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(xsiP)) { xsi = xsiP.intValue(); } DoubleParameter tauP = new DoubleParameter(TAU_ID); - tauP.addConstraint(new GreaterConstraint(0)); - tauP.addConstraint(new LessConstraint(1)); + tauP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + tauP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); if(config.grab(tauP)) { tau = tauP.doubleValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java new file mode 100644 index 
00000000..5f798a66 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java @@ -0,0 +1,605 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceMaximumDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * <p>
+ * Provides the DOC algorithm, and its heuristic variant, FastDOC. DOC is a
+ * sampling based subspace clustering algorithm.
+ * </p>
+ *
+ * <p>
+ * Reference: <br/>
+ * C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali<br />
+ * A Monte Carlo algorithm for fast projective clustering. <br/>
+ * In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02).
+ * </p>
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.has SubspaceModel
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("DOC: Density-based Optimal projective Clustering")
+@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", title = "A Monte Carlo algorithm for fast projective clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", url = "http://dx.doi.org/10.1145/564691.564739")
+public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(DOC.class);
+
+ /**
+ * Relative density threshold parameter alpha.
+ */
+ private double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ private double beta;
+
+ /**
+ * Half width parameter.
+ */
+ private double w;
+
+ /**
+ * Holds the value of {@link Parameterizer#HEURISTICS_ID}.
+ */
+ private boolean heuristics;
+
+ /**
+ * Holds the value of {@link Parameterizer#D_ZERO_ID}.
+ */
+ private int d_zero;
+
+ /**
+ * Randomizer used internally for sampling points.
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha α relative density threshold.
+ * @param beta β balancing parameter for size vs. dimensionality.
+ * @param w <em>w</em> half width parameter.
+ * @param heuristics whether to use heuristics (FastDOC) or not.
+ * @param random Random factory
+ */
+ public DOC(double alpha, double beta, double w, boolean heuristics, int d_zero, RandomFactory random) {
+ this.alpha = alpha;
+ this.beta = beta;
+ this.w = w;
+ this.heuristics = heuristics;
+ this.d_zero = d_zero;
+ this.rnd = random;
+ }
+
+ /**
+ * Performs the DOC or FastDOC (as configured) algorithm on the given
+ * Database.
+ *
+ * <p>
+ * This will run exhaustively, i.e. run DOC until no clusters are found
+ * anymore / the database size has shrunk below the threshold for minimum
+ * cluster size.
+ * </p>
+ *
+ * @param database Database
+ * @param relation Data relation
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ // Dimensionality of our set.
+ final int d = RelationUtil.dimensionality(relation);
+
+ // Get available DBIDs as a set we can remove items from.
+ ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());
+
+ // Precompute values as described in Figure 2.
+ double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
+ // Outer loop count.
+ int n = (int) (2. / alpha);
+ // Inner loop count.
+ int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
+ if(heuristics) {
+ m = Math.min(m, Math.min(1000000, d * d));
+ }
+
+ // Minimum size for a cluster for it to be accepted.
+ int minClusterSize = (int) (alpha * S.size());
+
+ // List of all clusters we found.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("DOC Clusters", "DOC");
+
+ // Inform the user about the number of actual clusters found so far.
+ IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
+
+ // To not only find a single cluster, we continue running until our set
+ // of points is empty.
+ while(S.size() > minClusterSize) {
+ Cluster<SubspaceModel<V>> C;
+ if(heuristics) {
+ C = runFastDOC(relation, S, d, n, m, (int) r);
+ }
+ else {
+ C = runDOC(relation, S, d, n, m, (int) r, minClusterSize);
+ }
+
+ if(C == null) {
+ // Stop trying if we couldn't find a cluster.
+ break;
+ }
+ // Found a cluster, remember it, remove its points from the set.
+ result.addToplevelCluster(C);
+
+ // Remove all points of the cluster from the set and continue.
+ S.removeDBIDs(C.getIDs());
+
+ if(cprogress != null) {
+ cprogress.setProcessed(result.getAllClusters().size(), LOG);
+ }
+ }
+
+ // Add the remainder as noise.
+ if(S.size() > 0) {
+ BitSet alldims = new BitSet();
+ alldims.set(0, d);
+ result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel<>(new Subspace(alldims), Centroid.make(relation, S).toVector(relation))));
+ }
+
+ if(cprogress != null) {
+ cprogress.setCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Performs a single run of DOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @param minClusterSize Minimum size a cluster must have to be accepted.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
+ final DoubleDistance wd = new DoubleDistance(w);
+ // Best cluster for the current run.
+ DBIDs C = null;
+ // Relevant attributes for the best cluster.
+ BitSet D = null;
+ // Quality of the best cluster.
+ double quality = Double.NEGATIVE_INFINITY;
+
+ // Bounds for our cluster.
+ // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new
+ // double[d], new double[d]);
+
+ // Weights for distance (= rectangle query)
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(new BitSet(d));
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq);
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+ DBIDArrayIter iter = S.iter();
+
+ for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension and build bounding box.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+ if(nD.cardinality() > 0) {
+ // Get all points in the box.
+ df.setSelectedDimensions(nD);
+ // TODO: add filtering capabilities into query API!
+ DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, wd));
+
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + nD.cardinality());
+ }
+
+ // Is the cluster large enough?
+ if(nC.size() < minClusterSize) {
+ // Too small.
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but it's too small.");
+ }
+ }
+ else {
+ // Better cluster than before?
+ double nQuality = computeClusterQuality(nC.size(), nD.cardinality());
+ if(nQuality > quality) {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
+ }
+ C = nC;
+ D = nD;
+ quality = nQuality;
+ }
+ else {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but we already have a better one.");
+ }
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ if(C != null) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Performs a single run of FastDOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
+ // Relevant attributes of highest cardinality.
+ BitSet D = null;
+ // The seed point for the best dimensions.
+ DBIDVar dV = DBIDUtil.newVar();
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+
+ DBIDArrayIter iter = S.iter();
+ outer: for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+
+ if(D == null || nD.cardinality() > D.cardinality()) {
+ D = nD;
+ dV.set(iter);
+
+ if(D.cardinality() >= d_zero) {
+ if(iprogress != null) {
+ iprogress.setProcessed(iprogress.getTotal(), LOG);
+ }
+ break outer;
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ // If no relevant dimensions were found, skip it.
+ if(D == null || D.cardinality() == 0) {
+ return null;
+ }
+
+ // Get all points in the box.
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
+
+ // TODO: add filtering capabilities into query API!
+ DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, new DoubleDistance(w)));
+
+ // If we have a non-empty cluster, return it.
+ if(C.size() > 0) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Utility method to test if a given dimension is relevant as determined via a
+ * set of reference points (i.e. if the spread, max - min, along the attribute
+ * is at most the width threshold w).
+ *
+ * @param dimension the dimension to test.
+ * @param relation used to get actual values for DBIDs.
+ * @param points the points to test.
+ * @return <code>true</code> if the dimension is relevant.
+ */
+ private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
+ double min = Double.POSITIVE_INFINITY;
+ double max = Double.NEGATIVE_INFINITY;
+ for(DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
+ V xV = relation.get(iter);
+ min = Math.min(min, xV.doubleValue(dimension));
+ max = Math.max(max, xV.doubleValue(dimension));
+ if(max - min > w) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Utility method to create a subspace cluster from a list of DBIDs and the
+ * relevant attributes.
+ *
+ * @param relation to compute a centroid.
+ * @param C the cluster points.
+ * @param D the relevant dimensions.
+ * @return an object representing the subspace cluster.
+ */
+ private Cluster<SubspaceModel<V>> makeCluster(Relation<V> relation, DBIDs C, BitSet D) {
+ DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
+ Cluster<SubspaceModel<V>> cluster = new Cluster<>(ids);
+ cluster.setModel(new SubspaceModel<>(new Subspace(D), Centroid.make(relation, ids).toVector(relation)));
+ return cluster;
+ }
+
+ /**
+ * Computes the quality of a cluster based on its size and number of relevant
+ * attributes, as described via the μ-function from the paper.
+ *
+ * @param clusterSize the size of the cluster.
+ * @param numRelevantDimensions the number of dimensions relevant to the
+ * cluster.
+ * @return a quality measure (only use this to compare the quality to that of
+ * other clusters).
+ */
+ private double computeClusterQuality(int clusterSize, int numRelevantDimensions) {
+ return clusterSize * Math.pow(1. / beta, numRelevantDimensions);
+ }
+
+ // ---------------------------------------------------------------------- //
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("doc.alpha", "Minimum relative density for a set of points to be considered a cluster (|C|>=doc.alpha*|S|).");
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ public static final OptionID BETA_ID = new OptionID("doc.beta", "Preference of cluster size versus number of relevant dimensions (higher value means higher priority on larger clusters).");
+
+ /**
+ * Half width parameter.
+ */
+ public static final OptionID W_ID = new OptionID("doc.w", "Maximum extent of scattering of points along a single attribute for the attribute to be considered relevant.");
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ public static final OptionID HEURISTICS_ID = new OptionID("doc.fastdoc", "Use heuristics as described, thus using the FastDOC algorithm (not yet implemented).");
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ public static final OptionID D_ZERO_ID = new OptionID("doc.d0", "Parameter for FastDOC, setting the number of relevant attributes which, when found for a cluster, are deemed enough to stop iterating.");
+
+ /**
+ * Random seeding parameter.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("doc.random-seed", "Random seed, for reproducible experiments.");
+
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ protected double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions
+ */
+ protected double beta;
+
+ /**
+ * Half width parameter.
+ */
+ protected double w;
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ protected boolean heuristics;
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ protected int d_zero;
+
+ /**
+ * Random seeding factory.
+ */
+ protected RandomFactory random = RandomFactory.DEFAULT;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(BETA_ID, 0.8);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(param)) {
+ beta = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(W_ID, 0.05);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ w = param.getValue();
+ }
+ }
+
+ {
+ Flag param = new Flag(HEURISTICS_ID);
+ if(config.grab(param)) {
+ heuristics = param.getValue();
+ }
+ }
+
+ if(heuristics) {
+ IntParameter param = new IntParameter(D_ZERO_ID, 5);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ d_zero = param.getValue();
+ }
+ }
+
+ {
+ RandomParameter param = new RandomParameter(RANDOM_ID);
+ if(config.grab(param)) {
+ random = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected DOC<V> makeInstance() {
+ return new DOC<>(alpha, beta, w, heuristics, d_zero, random);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index b17ebebb..cd5e51b8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -69,8 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -170,12 +169,12 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin */ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { // Instantiate DiSH distance (and thus run the preprocessor) - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("*** Run DiSH preprocessor."); } DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation); // Configure and run OPTICS. 
- if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("*** Run OPTICS algorithm."); } ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters); @@ -186,7 +185,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin optics = opticsconfig.tryInstantiate(cls); ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("*** Compute Clusters."); } return computeClusters(relation, opticsResult, dishDistanceQuery); @@ -206,10 +205,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // extract clusters Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 1: extract clusters"); - for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); } } @@ -218,10 +217,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // check if there are clusters < minpts checkClusters(database, distFunc, clustersMap, minpts); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 2: check clusters"); - for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { 
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size()); } } @@ -230,9 +229,9 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // sort the clusters List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 3: sort clusters"); - for (Cluster<SubspaceModel<V>> c : clusters) { + for(Cluster<SubspaceModel<V>> c : clusters) { msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size()); } LOG.verbose(msg.toString()); @@ -241,14 +240,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // build the hierarchy Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); buildHierarchy(database, distFunc, clustering, clusters, dimensionality); - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 4: build hierarchy"); - for (Cluster<SubspaceModel<V>> c : clusters) { + for(Cluster<SubspaceModel<V>> c : clusters) { msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); - for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { + for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { msg.append("\n parent ").append(iter.get()); } - for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { + for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { msg.append("\n child ").append(iter.get()); } } @@ -256,8 +255,8 @@ public class DiSH<V extends NumberVector<?>> 
extends AbstractAlgorithm<Clusterin } // build result - for (Cluster<SubspaceModel<V>> c : clusters) { - if (clustering.getClusterHierarchy().numParents(c) == 0) { + for(Cluster<SubspaceModel<V>> c : clusters) { + if(clustering.getClusterHierarchy().numParents(c) == 0) { clustering.addToplevelCluster(c); } } @@ -278,7 +277,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>(); Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>(); Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); - for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { + for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next(); entryMap.put(entry.getID(), entry); @@ -287,43 +286,43 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // get the list of (parallel) clusters for the preference vector List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector); - if (parallelClusters == null) { + if(parallelClusters == null) { parallelClusters = new ArrayList<>(); clustersMap.put(preferenceVector, parallelClusters); } // look for the proper cluster Pair<BitSet, ArrayModifiableDBIDs> cluster = null; - for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database); PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector); - if (dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { + 
if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) { double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector()); - if (d <= 2 * epsilon) { + if(d <= 2 * epsilon) { cluster = c; break; } } } - if (cluster == null) { + if(cluster == null) { cluster = new Pair<>(preferenceVector, DBIDUtil.newArray()); parallelClusters.add(cluster); } cluster.second.add(entry.getID()); entryToClusterMap.put(entry.getID(), cluster); - if (progress != null) { + if(progress != null) { progress.setProcessed(++processed, LOG); } } - if (progress != null) { + if(progress != null) { progress.ensureCompleted(LOG); } - if (LOG.isDebuggingFiner()) { + if(LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder("Step 0"); - for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { - for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { + for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) { msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size()); } } @@ -331,24 +330,24 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } // add the predecessor to the cluster - for (BitSet pv : clustersMap.keySet()) { + for(BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for (Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { - if (cluster.second.isEmpty()) { + for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) { + if(cluster.second.isEmpty()) { continue; } DBID firstID = cluster.second.get(0); ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID); DBID predecessorID = entry.getPredecessorID(); - if (predecessorID == null) { + if(predecessorID == null) { continue; } 
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID); // parallel cluster - if (predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { + if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) { continue; } - if (predecessor.getReachability().compareTo(entry.getReachability()) < 0) { + if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) { continue; } @@ -375,16 +374,17 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin final int db_dim = RelationUtil.dimensionality(database); // int num = 1; List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>(); - for (BitSet pv : clustersMap.keySet()) { + for(BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for (int i = 0; i < parallelClusters.size(); i++) { + for(int i = 0; i < parallelClusters.size(); i++) { Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i); Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second); cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, ""); - if (parallelClusters.size() > 1) { + if(parallelClusters.size() > 1) { cluster.setName("Cluster_" + subspace + "_" + i); - } else { + } + else { cluster.setName("Cluster_" + subspace); } clusters.add(cluster); @@ -417,11 +417,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>(); Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray()); - for (BitSet pv : 
clustersMap.keySet()) { + for(BitSet pv : clustersMap.keySet()) { // noise - if (pv.cardinality() == 0) { + if(pv.cardinality() == 0) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { noise.second.addDBIDs(c.second); } } @@ -429,10 +429,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin else { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size()); - for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { - if (!pv.equals(new BitSet()) && c.second.size() < minpts) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { + if(!pv.equals(new BitSet()) && c.second.size() < minpts) { notAssigned.add(c); - } else { + } + else { newParallelClusters.add(c); } } @@ -443,14 +444,15 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin clustersMap.clear(); clustersMap.putAll(newClustersMap); - for (Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { - if (c.second.isEmpty()) { + for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) { + if(c.second.isEmpty()) { continue; } Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap); - if (parent != null) { + if(parent != null) { parent.second.addDBIDs(c.second); - } else { + } + else { noise.second.addDBIDs(c.second); } } @@ -477,23 +479,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin BitSet childPV = child.first; int childCardinality = childPV.cardinality(); - for (BitSet parentPV : clustersMap.keySet()) { + for(BitSet parentPV : clustersMap.keySet()) { int parentCardinality = parentPV.cardinality(); - if (parentCardinality >= childCardinality) { + if(parentCardinality >= 
childCardinality) { continue; } - if (resultCardinality != -1 && parentCardinality <= resultCardinality) { + if(resultCardinality != -1 && parentCardinality <= resultCardinality) { continue; } BitSet pv = (BitSet) childPV.clone(); pv.and(parentPV); - if (pv.equals(parentPV)) { + if(pv.equals(parentPV)) { List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV); - for (Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { + for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) { V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database); double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV); - if (d <= 2 * epsilon) { + if(d <= 2 * epsilon) { result = parent; resultCardinality = parentCardinality; break; @@ -519,57 +521,59 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin final int db_dim = RelationUtil.dimensionality(database); Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy(); - for (int i = 0; i < clusters.size() - 1; i++) { + for(int i = 0; i < clusters.size() - 1; i++) { Cluster<SubspaceModel<V>> c_i = clusters.get(i); int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality(); V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database); - for (int j = i + 1; j < clusters.size(); j++) { + for(int j = i + 1; j < clusters.size(); j++) { Cluster<SubspaceModel<V>> c_j = clusters.get(j); int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality(); - if (subspaceDim_i < subspaceDim_j) { - if (LOG.isDebugging()) { + if(subspaceDim_i < subspaceDim_j) { + if(LOG.isDebugging()) { msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']'); msg.append("\n l_j=").append(subspaceDim_j).append(" 
pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']'); } // noise level reached - if (c_j.getModel().getSubspace().dimensionality() == 0) { + if(c_j.getModel().getSubspace().dimensionality() == 0) { // no parents exists -> parent is noise - if (hier.numParents(c_i) == 0) { + if(hier.numParents(c_i) == 0) { clustering.addChildCluster(c_j, c_i); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); msg.append(']'); } } - } else { + } + else { V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions()); double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector()); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { msg.append("\n dist = ").append(distance.getCorrelationValue()); } - if (distance.getCorrelationValue() == subspaceDim_j) { - if (LOG.isDebugging()) { + if(distance.getCorrelationValue() == subspaceDim_j) { + if(LOG.isDebugging()) { msg.append("\n d = ").append(d); } - if (d <= 2 * epsilon) { + if(d <= 2 * epsilon) { // no parent exists or c_j is not a parent of the already // existing parents - if (hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { + if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { clustering.addChildCluster(c_j, c_i); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of ["); 
msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); msg.append(']'); } } - } else { + } + else { throw new RuntimeException("Should never happen: d = " + d); } } @@ -577,7 +581,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { LOG.debug(msg.toString()); } } @@ -599,11 +603,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin int dimensionality = RelationUtil.dimensionality(database); int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality(); - for (; iter.valid(); iter.advance()) { + for(; iter.valid(); iter.advance()) { Cluster<SubspaceModel<V>> child = iter.get(); V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions()); - if (distance.getCorrelationValue() == subspaceDim_parent) { + if(distance.getCorrelationValue() == subspaceDim_parent) { return true; } } @@ -642,14 +646,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin super.makeOptions(config); DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, 0.001); - epsilonP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(epsilonP)) { + epsilonP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(epsilonP)) { epsilon = epsilonP.doubleValue(); } IntParameter muP = new IntParameter(MU_ID, 1); - muP.addConstraint(new GreaterConstraint(0)); - if (config.grab(muP)) { + muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(muP)) { mu = muP.intValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 9ac7c072..3f135564 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -34,8 +34,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -95,8 +94,8 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0)); - alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = 0.0;
if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java new file mode 100644 index 00000000..9d1ee94d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java @@ -0,0 +1,1000 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.PoissonDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * P3C: A Robust Projected Clustering Algorithm.
+ *
+ * <p>
+ * Reference: <br/>
+ * Gabriela Moise, Jörg Sander, Martin Ester<br />
+ * P3C: A Robust Projected Clustering Algorithm.<br/>
+ * In: Proc. Sixth International Conference on Data Mining (ICDM '06)
+ * </p>
+ *
+ * This is not a complete implementation of P3C, but good enough for most users.
+ * Improvements are welcome. The most obviously missing step is section 3.5 of
+ * P3C, where the cluster subspaces are refined.
+ *
+ * @author Florian Nuecke
+ * @author Erich Schubert
+ *
+ * @apiviz.uses EM
+ * @apiviz.has SubspaceModel
+ * @apiviz.has ClusterCandidate
+ * @apiviz.has Signature
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("P3C: A Robust Projected Clustering Algorithm.")
+@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
+public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(P3C.class);
+
+ /**
+ * Parameter for the Poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+ * Minimum cluster size for noise flagging. (Not existing in the original
+ * publication).
+ */
+ protected int minClusterSize;
+
+ /**
+ * Alpha threshold for testing.
+ */
+ protected double alpha = 0.001;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha ChiSquared test threshold
+ * @param poissonThreshold Poisson test threshold
+ * @param maxEmIterations Maximum number of EM iterations
+ * @param emDelta EM stopping threshold
+ * @param minClusterSize Minimum cluster size
+ */
+ public P3C(double alpha, double poissonThreshold, int maxEmIterations, double emDelta, int minClusterSize) {
+ super();
+ this.alpha = alpha;
+ this.poissonThreshold = poissonThreshold;
+ this.maxEmIterations = maxEmIterations;
+ this.emDelta = emDelta;
+ this.minClusterSize = minClusterSize;
+ }
+
+ /**
+ * Performs the P3C algorithm on the given Database.
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+
+ // Overall progress.
+ StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(8) : null;
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
+ }
+
+ // Desired number of bins, as per Sturge:
+ final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
+
+ // Perform 1-dimensional projections, and split into bins.
+ SetDBIDs[][] partitions = partitionData(relation, binCount);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
+ }
+
+ // Set markers for each attribute until they're all deemed uniform.
+ final long[][] markers = new long[dim][];
+ int numuniform = 0;
+ for(int d = 0; d < dim; d++) {
+ final SetDBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d] = BitsUtil.zero(binCount);
+ int card = 0;
+ while(card < dim - 1) {
+ // Find bin with largest support, test only the dimensions that were not
+ // previously marked.
+ int bestBin = chiSquaredUniformTest(parts, marked, card);
+ if(bestBin < 0) {
+ numuniform++;
+ break; // Uniform
+ }
+ BitsUtil.setI(marked, bestBin);
+ card++;
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
+ }
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
+ }
+
+ ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
+ }
+
+ ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
+ }
+
+ clusterCores = pruneRedundantClusterCores(clusterCores);
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of cluster cores found: " + clusterCores.size());
+ }
+
+ if(clusterCores.size() == 0) {
+ stepProgress.setCompleted(LOG);
+ Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
+ c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+ return c;
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Refining cluster cores to clusters via EM.", LOG);
+ }
+
+ // Track objects not assigned to any cluster:
+ ModifiableDBIDs noise = DBIDUtil.newHashSet();
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+
+ // Initial estimate of covariances, to assign noise objects
+ Vector[] means = new Vector[k];
+ Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
+ double[] normDistrFactor = new double[k];
+ Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+ for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
+ final double emOld = emNew;
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ // reassign probabilities
+ emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("iteration " + it + " - expectation value: " + emNew);
+ }
+ if((emNew - emOld) <= emDelta) {
+ break;
+ }
+ }
+
+ // Perform EM clustering.
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(6, "Generating hard clustering.", LOG);
+ }
+
+ // Create a hard clustering, making sure each data point only is part of one
+ // cluster, based on the best match from the membership matrix.
+ ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(7, "Looking for outliers and moving them to the noise set.", LOG);
+ }
+
+ // Outlier detection. Remove points from clusters that have a Mahalanobis
+ // distance larger than the critical value of the ChiSquare distribution.
+ findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(8, "Removing empty clusters.", LOG);
+ }
+
+ // Remove near-empty clusters.
+ for(Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext();) {
+ ClusterCandidate cand = it.next();
+ final int size = cand.ids.size();
+ if(size < minClusterSize) {
+ if(size > 0) {
+ noise.addDBIDs(cand.ids);
+ }
+ it.remove();
+ }
+ }
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
+ }
+
+ // TODO Check all attributes previously deemed uniform (section 3.5).
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(9, "Generating final result.", LOG);
+ }
+
+ // Generate final output.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
+ ClusterCandidate candidate = clusterCandidates.get(cluster);
+ CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ }
+ LOG.verbose("Noise size: " + noise.size());
+ if(noise.size() > 0) {
+ result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ }
+
+ if(stepProgress != null) {
+ stepProgress.ensureCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Construct the 1-signatures by merging adjacent dense bins.
+ *
+ * @param partitions Initial partitions.
+ * @param markers Markers for dense partitions.
+ * @return 1-signatures
+ */
+ private ArrayList<Signature> constructOneSignatures(SetDBIDs[][] partitions, final long[][] markers) {
+ final int dim = partitions.length;
+ // Generate projected p-signature intervals.
+ ArrayList<Signature> signatures = new ArrayList<>();
+ for(int d = 0; d < dim; d++) {
+ final DBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d];
+ // Find sequences of 1s in marked.
+ for(int start = BitsUtil.nextSetBit(marked, 0); start >= 0;) {
+ int end = BitsUtil.nextClearBit(marked, start + 1);
+ end = (end == -1) ? dim : end;
+ int[] signature = new int[dim << 1];
+ Arrays.fill(signature, -1);
+ signature[d << 1] = start;
+ signature[(d << 1) + 1] = end - 1; // inclusive
+ HashSetModifiableDBIDs sids = unionDBIDs(parts, start, end /* exclusive */);
+ if(LOG.isDebugging()) {
+ LOG.debug("1-signature: " + d + " " + start + "-" + (end - 1));
+ }
+ signatures.add(new Signature(signature, sids));
+ start = (end < dim) ? BitsUtil.nextSetBit(marked, end + 1) : -1;
+ }
+ }
+ return signatures;
+ }
+
+ /**
+ * Merge 1-signatures into p-signatures.
+ *
+ * @param binCount Number of bins in each dimension.
+ * @param signatures 1-signatures
+ * @return p-signatures
+ */
+ private ArrayList<Signature> mergeClusterCores(final int binCount, ArrayList<Signature> signatures) {
+ MutableProgress mergeProgress = LOG.isVerbose() ? new MutableProgress("Merging signatures.", signatures.size(), LOG) : null;
+
+ // Annotate dimensions to 1-signatures for quick stopping.
+ int[] firstdim = new int[signatures.size()];
+ for(int i = 0; i < signatures.size(); i++) {
+ firstdim[i] = signatures.get(i).getFirstDim();
+ }
+ LOG.debug("First dimensions: " + FormatUtil.format(firstdim));
+
+ // Merge to (p+1)-signatures (cluster cores).
+ ArrayList<Signature> clusterCores = new ArrayList<>(signatures);
+ // Try adding merge 1-signature with each cluster core.
+ for(int i = 0; i < clusterCores.size(); i++) {
+ final Signature parent = clusterCores.get(i);
+ final int end = parent.getFirstDim();
+ for(int j = 0; j < signatures.size() && firstdim[j] < end; j++) {
+ final Signature onesig = signatures.get(j);
+ final Signature merge = mergeSignatures(parent, onesig, binCount);
+ if(merge != null) {
+ // We add each potential core to the list to allow remaining
+ // 1-signatures to try merging with this p-signature as well.
+ clusterCores.add(merge);
+ // Flag both "parents" for removal.
+ parent.prune = true;
+ onesig.prune = true;
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setTotal(clusterCores.size());
+ mergeProgress.incrementProcessed(LOG);
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setProcessed(mergeProgress.getTotal(), LOG);
+ }
+ return clusterCores;
+ }
+
+ private ArrayList<Signature> pruneRedundantClusterCores(ArrayList<Signature> clusterCores) {
+ // Prune cluster cores based on Definition 3, Condition 2.
+ ArrayList<Signature> retain = new ArrayList<>(clusterCores.size());
+ outer: for(Signature clusterCore : clusterCores) {
+ if(clusterCore.prune) {
+ continue;
+ }
+ for(int k = 0; k < clusterCores.size(); k++) {
+ Signature other = clusterCores.get(k);
+ if(other != clusterCore) {
+ if(other.isSuperset(clusterCore)) {
+ continue outer;
+ }
+ }
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Retained cluster core: " + clusterCore);
+ }
+ retain.add(clusterCore);
+ }
+ clusterCores = retain;
+ return clusterCores;
+ }
+
+ /**
+ * Partition the data set into {@code bins} bins in each dimension
+ * <i>independently</i>.
+ *
+ * This can be used to construct a grid approximation of the data using O(d n)
+ * memory.
+ *
+ * When a dimension is found to be constant, it will not be partitioned, but
+ * instead the corresponding array will be set to {@code null}.
+ *
+ * @param relation Data relation to partition
+ * @param bins Number of bins
+ * @return Partitions of each dimension.
+ */
+ private SetDBIDs[][] partitionData(final Relation<V> relation, final int bins) {
+ final int dim = RelationUtil.dimensionality(relation);
+ SetDBIDs[][] partitions = new SetDBIDs[dim][bins];
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ DBIDArrayIter iter = ids.iter(); // will be reused.
+ SortDBIDsBySingleDimension sorter = new VectorUtil.SortDBIDsBySingleDimension(relation, 0);
+ for(int d = 0; d < dim; d++) {
+ sorter.setDimension(d);
+ ids.sort(sorter);
+ // Minimum:
+ iter.seek(0);
+ double min = relation.get(iter).doubleValue(d);
+ // Extend:
+ iter.seek(ids.size() - 1);
+ double delta = (relation.get(iter).doubleValue(d) - min) / bins;
+ if(delta > 0.) {
+ SetDBIDs[] dimparts = partitions[d];
+ double split = min + delta;
+ HashSetModifiableDBIDs pids = DBIDUtil.newHashSet();
+ dimparts[0] = pids;
+ int i = 0;
+ for(iter.seek(0); iter.valid(); iter.advance()) {
+ final double v = relation.get(iter).doubleValue(d);
+ if(v <= split || i == dimparts.length - 1) {
+ pids.add(iter);
+ }
+ else {
+ i++;
+ split += delta;
+ pids = DBIDUtil.newHashSet();
+ dimparts[i] = pids;
+ }
+ }
+ for(++i; i < dimparts.length; ++i) {
+ dimparts[i] = pids;
+ }
+ }
+ else {
+ partitions[d] = null; // Flag whole dimension as bad
+ }
+ }
+ return partitions;
+ }
+
+ /**
+ * Compute the union of multiple DBID sets.
+ *
+ * @param parts Parts array
+ * @param start Array start index
+ * @param end Array end index (exclusive)
+ * @return
+ */
+ protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
+ int sum = 0;
+ for(int i = start; i < end; i++) {
+ sum += parts[i].size();
+ }
+ HashSetModifiableDBIDs sids = DBIDUtil.newHashSet(sum);
+ for(int i = start; i < end; i++) {
+ sids.addDBIDs(parts[i]);
+ }
+ return sids;
+ }
+
  /**
   * Performs a ChiSquared test to determine whether an attribute has a uniform
   * distribution.
   * 
   * @param parts Data partitions (one set per bin).
   * @param marked the marked bins that should be ignored.
   * @param card Cardinality - the number of bins already marked.
   * @return Position of maximum, or -1 when uniform.
   */
  private int chiSquaredUniformTest(SetDBIDs[] parts, long[] marked, int card) {
    // Remaining number of bins.
    final int binCount = parts.length - card;
    // Get global mean over all unmarked bins.
    int max = 0, maxpos = -1;
    MeanVariance mv = new MeanVariance();
    for(int i = 0; i < parts.length; i++) {
      // Ignore already marked bins.
      if(BitsUtil.get(marked, i)) {
        continue;
      }
      final int binSupport = parts[i].size();
      mv.put(binSupport);
      // Track the best-supported bin; it is the candidate position to return.
      if(binSupport > max) {
        max = binSupport;
        maxpos = i;
      }
    }
    // No unmarked bins left, or all bins have identical support: treat as
    // uniform (nothing stands out).
    if(mv.getCount() < 1. || !(mv.getNaiveVariance() > 0.)) {
      return -1;
    }
    // ChiSquare statistic is the naive variance of the sizes!
    final double chiSquare = mv.getNaiveVariance() / mv.getMean();
    // NOTE(review): binCount is already parts.length - card, so subtracting
    // card again here reduces the degrees of freedom twice — confirm intended.
    final double test = ChiSquaredDistribution.cdf(chiSquare, Math.max(1, binCount - card - 1));
    // Reject uniformity at significance level alpha: report the largest bin.
    if((1. - alpha) < test) {
      return maxpos;
    }
    return -1;
  }
+
+ /**
+ * Computes a fuzzy membership with the weights based on which cluster cores
+ * each data point is part of.
+ *
+ * @param relation Data relation
+ * @param clusterCores the cluster cores.
+ * @param unassigned set to which to add unassigned points.
+ * @param probClusterIGivenX Membership probabilities.
+ * @param clusterWeights Cluster weights
+ */
+ private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+ final int n = relation.size();
+ final int k = clusterCores.size();
+
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ int count = 0;
+ double[] weights = new double[k];
+ for(int cluster = 0; cluster < k; ++cluster) {
+ if(clusterCores.get(cluster).ids.contains(iter)) {
+ weights[cluster] = 1.;
+ ++count;
+ }
+ }
+
+ // Set value(s) in membership matrix.
+ if(count > 0) {
+ // Rescale.
+ VMath.timesEquals(weights, 1. / count);
+ VMath.plusTimesEquals(clusterWeights, weights, 1. / n);
+ }
+ else {
+ // Does not match any cluster, mark it.
+ unassigned.add(iter);
+ }
+ probClusterIGivenX.put(iter, weights);
+ }
+ }
+
+ /**
+ * Assign unassigned objects to best candidate based on shortest Mahalanobis
+ * distance.
+ *
+ * @param relation Data relation
+ * @param probClusterIGivenX fuzzy membership matrix.
+ * @param means Cluster means.
+ * @param invCovMatr Cluster covariance matrices.
+ * @param clusterWeights
+ * @param assigned mapping of matrix row to DBID.
+ * @param unassigned the list of points not yet assigned.
+ */
+ private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+ if(unassigned.size() == 0) {
+ return;
+ }
+ final int k = means.length;
+ double pweight = 1. / relation.size();
+
+ for(DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
+ // Find the best matching known cluster core using the Mahalanobis
+ // distance.
+ Vector v = relation.get(iter).getColumnVector();
+ int bestCluster = -1;
+ double minDistance = Double.POSITIVE_INFINITY;
+ for(int c = 0; c < k; ++c) {
+ final double distance = MathUtil.mahalanobisDistance(invCovMatr[c], v.minus(means[c]));
+ if(distance < minDistance) {
+ minDistance = distance;
+ bestCluster = c;
+ }
+ }
+ // Assign to best core.
+ double[] weights = new double[k];
+ weights[bestCluster] = 1.0;
+ clusterWeights[bestCluster] += pweight;
+ probClusterIGivenX.put(iter, weights);
+ }
+
+ // Clear the list of unassigned objects.
+ unassigned.clear();
+ }
+
+ /**
+ * Creates a hard clustering from the specified soft membership matrix.
+ *
+ * @param probClusterIGivenX the membership matrix.
+ * @param dbids mapping matrix row to DBID.
+ * @return a hard clustering based on the matrix.
+ */
+ private ArrayList<ClusterCandidate> hardClustering(WritableDataStore<double[]> probClusterIGivenX, List<Signature> clusterCores, DBIDs dbids) {
+ final int k = clusterCores.size();
+
+ // Initialize cluster sets.
+ ArrayList<ClusterCandidate> candidates = new ArrayList<>();
+ for(Signature sig : clusterCores) {
+ candidates.add(new ClusterCandidate(sig));
+ }
+
+ // Perform hard partitioning, assigning each data point only to one cluster,
+ // namely that one it is most likely to belong to.
+ for(DBIDIter iter = dbids.iter(); iter.valid(); iter.advance()) {
+ final double[] probs = probClusterIGivenX.get(iter);
+ int bestCluster = 0;
+ double bestProbability = probs[0];
+ for(int c = 1; c < k; ++c) {
+ if(probs[c] > bestProbability) {
+ bestCluster = c;
+ bestProbability = probs[c];
+ }
+ }
+ candidates.get(bestCluster).ids.add(iter);
+ }
+
+ return candidates;
+ }
+
  /**
   * Performs outlier detection by testing the Mahalanobis distance of each
   * point in a cluster against the critical value of the ChiSquared
   * distribution with as many degrees of freedom as the cluster has relevant
   * attributes.
   * 
   * @param relation Data relation
   * @param means Cluster means
   * @param invCovMatr Inverse covariance matrixes
   * @param clusterCandidates the list of clusters to check.
   * @param nonUniformDimensionCount the number of dimensions to consider when
   *        testing. NOTE(review): this parameter is never read in the body —
   *        confirm whether it was meant to replace
   *        {@code candidate.dimensions.cardinality()} below.
   * @param noise the set to which to add points deemed outliers.
   */
  private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
    final int k = clusterCandidates.size();

    for(int c = 0; c < k; ++c) {
      final ClusterCandidate candidate = clusterCandidates.get(c);
      // Too small to estimate outliers from.
      if(candidate.ids.size() < 2) {
        continue;
      }
      // Degrees of freedom: number of dimensions flagged for this cluster.
      final int dof = candidate.dimensions.cardinality();
      // NOTE(review): significance level .001 is hardcoded here, unlike the
      // configurable alpha used elsewhere — confirm this is intended.
      final double threshold = ChiSquaredDistribution.quantile(1 - .001, dof);
      for(DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
        final Vector mean = means[c];
        final Vector delta = relation.get(iter).getColumnVector().minusEquals(mean);
        final Matrix invCov = invCovMatr[c];
        final double distance = MathUtil.mahalanobisDistance(invCov, delta);
        if(distance >= threshold) {
          // Outlier, remove it and add it to the outlier set.
          noise.add(iter);
          iter.remove();
        }
      }
    }
  }
+
+ /**
+ * Generates a merged signature of this and another one, where the other
+ * signature must be a 1-signature.
+ *
+ * @param first First signature.
+ * @param second Second signature, must be a 1-signature.
+ * @param numBins Number of bins per dimension.
+ * @return the merged signature, or null if the merge failed.
+ */
+ protected Signature mergeSignatures(Signature first, Signature second, int numBins) {
+ int d2 = -1;
+ for(int i = 0; i < second.spec.length; i += 2) {
+ if(second.spec[i] >= 0) {
+ assert (d2 == -1) : "Merging with non-1-signature?!?";
+ d2 = i;
+ }
+ }
+ assert (d2 >= 0) : "Merging with empty signature?";
+
+ // Avoid generating redundant signatures.
+ if(first.spec[d2] >= 0) {
+ return null;
+ }
+
+ // Definition 3, Condition 1:
+ // True support:
+ final ModifiableDBIDs intersection = DBIDUtil.intersection(first.ids, second.ids);
+ final int support = intersection.size();
+ // Interval width, computed using selected number of bins / total bins
+ double width = (second.spec[d2 + 1] - second.spec[d2] + 1.) / (double) numBins;
+ // Expected size thus:
+ double expect = first.ids.size() * width;
+ if(support <= expect || support < minClusterSize) {
+ return null;
+ }
+ final double test = PoissonDistribution.rawProbability(support, expect);
+ if((poissonThreshold) <= test) {
+ return null;
+ }
+ // Create merged signature.
+ int[] spec = first.spec.clone();
+ spec[d2] = second.spec[d2];
+ spec[d2 + 1] = second.spec[d2];
+
+ final Signature newsig = new Signature(spec, intersection);
+ if(LOG.isDebugging()) {
+ LOG.debug(newsig.toString());
+ }
+ return newsig;
+ }
+
+ /**
+ * P3C Cluster signature.
+ *
+ * @author Erich Schubert
+ */
+ private static class Signature {
+ /**
+ * Subspace specification
+ */
+ int[] spec;
+
+ /**
+ * Object ids.
+ */
+ DBIDs ids;
+
+ /**
+ * Pruning flag.
+ */
+ boolean prune = false;
+
+ /**
+ * Constructor.
+ *
+ * @param spec Subspace specification
+ * @param ids IDs.
+ */
+ private Signature(int[] spec, DBIDs ids) {
+ super();
+ this.spec = spec;
+ this.ids = ids;
+ }
+
+ /**
+ * Test whether this is a superset of the other signature.
+ *
+ * @param other Other signature.
+ * @return {@code true} when this is a superset.
+ */
+ public boolean isSuperset(Signature other) {
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] != other.spec[i] || spec[i + 1] != other.spec[i]) {
+ if(other.spec[i] != -1) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Find the first dimension set in this signature.
+ *
+ * @return Dimension
+ */
+ public int getFirstDim() {
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ return (i >>> 1);
+ }
+ }
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ int p = 0;
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ p++;
+ }
+ }
+ StringBuilder buf = new StringBuilder();
+ buf.append(p).append("-signature: ");
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ buf.append(i >>> 1).append(':');
+ buf.append(spec[i]).append('-').append(spec[i + 1]).append(' ');
+ }
+ }
+ buf.append(" size: ").append(ids.size());
+ return buf.toString();
+ }
+ }
+
+ /**
+ * This class is used to represent potential clusters.
+ *
+ * @author Erich Schubert
+ */
+ private static class ClusterCandidate {
+ /**
+ * Selected dimensions
+ */
+ public final BitSet dimensions;
+
+ /**
+ * Objects contained in cluster.
+ */
+ public final ModifiableDBIDs ids;
+
+ /**
+ * Constructor.
+ *
+ * @param clusterCore Signature
+ */
+ public ClusterCandidate(Signature clusterCore) {
+ this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+ for(int i = 0; i < clusterCore.spec.length; i += 2) {
+ this.dimensions.set(i >> 1);
+ }
+ this.ids = DBIDUtil.newArray(clusterCore.ids.size());
+ }
+ }
+
  // This algorithm operates on numerical vector fields only.
  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }
+
  // Expose the class logger to the abstract base class.
  @Override
  protected Logging getLogger() {
    return LOG;
  }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter for the chi squared test threshold.
+ */
+ public static final OptionID ALPHA_THRESHOLD_ID = new OptionID("p3c.alpha", "The significance level for uniform testing in the initial binning step.");
+
+ /**
+ * Parameter for the poisson test threshold.
+ */
+ public static final OptionID POISSON_THRESHOLD_ID = new OptionID("p3c.threshold", "The threshold value for the poisson test used when merging signatures.");
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ public static final OptionID MAX_EM_ITERATIONS_ID = new OptionID("p3c.em.maxiter", "The maximum number of iterations for the EM step. Use -1 to run until delta convergence.");
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ public static final OptionID EM_DELTA_ID = new OptionID("p3c.em.delta", "The change delta for the EM step below which to stop.");
+
+ /**
+ * Minimum cluster size for noise flagging. (Not existant in the original
+ * publication).
+ */
+ public static final OptionID MIN_CLUSTER_SIZE_ID = new OptionID("p3c.minsize", "The minimum size of a cluster, otherwise it is seen as noise (this is a cheat, it is not mentioned in the paper).");
+
+ /**
+ * Parameter for the chi squared test threshold.
+ *
+ * While statistical values such as 0.01 are a good choice, we found the
+ * need to modify this parameter in our experiments.
+ */
+ protected double alpha;
+
+ /**
+ * Parameter for the poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+ * Minimum cluster size for noise flagging. (Not existant in the original
+ * publication).
+ */
+ protected int minClusterSize;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_THRESHOLD_ID, .001);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(POISSON_THRESHOLD_ID, 1.e-4);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+ if(config.grab(param)) {
+ poissonThreshold = param.getValue();
+ }
+ }
+
+ {
+ IntParameter param = new IntParameter(MAX_EM_ITERATIONS_ID, 20);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_MINUSONE_INT);
+ if(config.grab(param)) {
+ maxEmIterations = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(EM_DELTA_ID, 1.e-5);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ emDelta = param.getValue();
+ }
+ }
+
+ {
+ IntParameter param = new IntParameter(MIN_CLUSTER_SIZE_ID, 1);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ minClusterSize = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected P3C<V> makeInstance() {
+ return new P3C<>(alpha, poissonThreshold, maxEmIterations, emDelta, minClusterSize);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 92158734..03e9978f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -67,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -148,7 +148,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); - final Random random = rnd.getRandom(); + final Random random = rnd.getSingleThreadedRandom(); if (RelationUtil.dimensionality(relation) < l) { throw new IllegalStateException("Dimensionality of data < parameter l! 
" + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")"); @@ -844,7 +844,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster configL(config); IntParameter m_iP = new IntParameter(M_I_ID, 10); - m_iP.addConstraint(new GreaterConstraint(0)); + m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(m_iP)) { m_i = m_iP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index c8d0833e..e6245f6e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -54,7 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -77,7 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Elke Achtert * * @apiviz.uses DBSCAN - * @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction + * @apiviz.uses DimensionSelectingSubspaceDistanceFunction * @apiviz.has SubspaceModel * * @param <V> the type of FeatureVector handled by this Algorithm @@ -488,7 +488,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } IntParameter minptsP = new IntParameter(MINPTS_ID); - minptsP.addConstraint(new GreaterConstraint(0)); + 
minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if (config.grab(minptsP)) { minpts = minptsP.getValue(); } |