diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
46 files changed, 2664 insertions, 402 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index ea441655..670a3f0f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -27,7 +27,6 @@ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.PROCLUS; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; @@ -49,7 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * @param <R> the result we return * @param <V> the type of FeatureVector handled by this Algorithm */ -public abstract class AbstractProjectedClustering<R extends Clustering<Model>, V extends NumberVector<V, ?>> extends AbstractAlgorithm<R> implements ClusteringAlgorithm<R> { +public abstract class AbstractProjectedClustering<R extends Clustering<?>, V extends NumberVector<V, ?>> extends AbstractAlgorithm<R> implements ClusteringAlgorithm<R> { /** * Parameter to specify the number of clusters to find, must be an integer * greater than 0. 
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index 108ba0ed..250cc70b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -166,7 +167,14 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext this.lambda = lambda; } - public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException { + /** + * Run the algorithm + * + * @param database Database to process + * @param relation Relation to process + * @return Clustering result + */ + public Clustering<Model> run(Database database, Relation<V> relation) { FiniteProgress objprog = getLogger().isVerbose() ? new FiniteProgress("Processing objects", relation.size(), getLogger()) : null; IndefiniteProgress clusprog = getLogger().isVerbose() ? 
new IndefiniteProgress("Number of clusters", getLogger()) : null; resultList = new ArrayList<ModifiableDBIDs>(); @@ -177,9 +185,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); if(relation.size() >= minpts) { - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { - expandCluster(distFunc, rangeQuery, id, objprog, clusprog); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { + expandCluster(distFunc, rangeQuery, iditer.getDBID(), objprog, clusprog); if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } @@ -191,8 +199,8 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } } else { - for(DBID id : relation.iterDBIDs()) { - noise.add(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + noise.add(iditer); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); clusprog.setProcessed(resultList.size(), getLogger()); @@ -284,28 +292,26 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DistanceResultPair<DoubleDistance> seed : seeds) { - DBID nextID = seed.getDBID(); - - Integer nextID_corrDim = distFunc.getIndex().getLocalProjection(nextID).getCorrelationDimension(); + int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension(); // nextID is not reachable from start object if(nextID_corrDim > lambda) { continue; } - if(!processedIDs.contains(nextID)) { - currentCluster.add(nextID); - processedIDs.add(nextID); + if(!processedIDs.contains(seed)) { + currentCluster.add(seed); + processedIDs.add(seed); } - else if(noise.contains(nextID)) { - currentCluster.add(nextID); - noise.remove(nextID); + 
else if(noise.contains(seed)) { + currentCluster.add(seed); + noise.remove(seed); } } seeds.remove(0); while(seeds.size() > 0) { - DBID q = seeds.remove(0).getDBID(); - Integer corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension(); + DistanceResultPair<DoubleDistance> q = seeds.remove(0); + int corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension(); // q forms no lambda-dim hyperplane if(corrDim_q > lambda) { continue; @@ -314,22 +320,22 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext List<DistanceResultPair<DoubleDistance>> reachables = rangeQuery.getRangeForDBID(q, epsilon); if(reachables.size() > minpts) { for(DistanceResultPair<DoubleDistance> r : reachables) { - Integer corrDim_r = distFunc.getIndex().getLocalProjection(r.getDBID()).getCorrelationDimension(); + int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension(); // r is not reachable from q if(corrDim_r > lambda) { continue; } - boolean inNoise = noise.contains(r.getDBID()); - boolean unclassified = !processedIDs.contains(r.getDBID()); + boolean inNoise = noise.contains(r); + boolean unclassified = !processedIDs.contains(r); if(inNoise || unclassified) { if(unclassified) { seeds.add(r); } - currentCluster.add(r.getDBID()); - processedIDs.add(r.getDBID()); + currentCluster.add(r); + processedIDs.add(r); if(inNoise) { - noise.remove(r.getDBID()); + noise.remove(r); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); @@ -349,9 +355,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext resultList.add(currentCluster); } else { - for(DBID id : currentCluster) { - noise.add(id); - } + noise.addDBIDs(currentCluster); noise.add(startObjectID); processedIDs.add(startObjectID); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java index 5ec59777..8f637460 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java @@ -47,5 +47,5 @@ import de.lmu.ifi.dbs.elki.database.Database; */ public interface ClusteringAlgorithm<C extends Clustering<? extends Model>> extends Algorithm { @Override - C run(Database database) throws IllegalStateException; + C run(Database database); }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index b59af555..6bafa9e9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -141,9 +142,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(size); if(size >= minpts) { - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { - expandCluster(relation, rangeQuery, id, objprog, clusprog); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { + expandCluster(relation, rangeQuery, iditer.getDBID(), objprog, clusprog); } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), logger); @@ -155,8 +156,8 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor } } else { - for(DBID id : relation.iterDBIDs()) { - noise.add(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + noise.add(iditer); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), logger); clusprog.setProcessed(resultList.size(), logger); @@ -210,35 +211,33 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DistanceResultPair<D> seed : 
seeds) { - DBID nextID = seed.getDBID(); - if(!processedIDs.contains(nextID)) { - currentCluster.add(nextID); - processedIDs.add(nextID); + if(!processedIDs.contains(seed)) { + currentCluster.add(seed); + processedIDs.add(seed); } - else if(noise.contains(nextID)) { - currentCluster.add(nextID); - noise.remove(nextID); + else if(noise.contains(seed)) { + currentCluster.add(seed); + noise.remove(seed); } } seeds.remove(0); while(seeds.size() > 0) { - DBID o = seeds.remove(0).getDBID(); + DistanceResultPair<D> o = seeds.remove(0); List<DistanceResultPair<D>> neighborhood = rangeQuery.getRangeForDBID(o, epsilon); if(neighborhood.size() >= minpts) { for(DistanceResultPair<D> neighbor : neighborhood) { - DBID p = neighbor.getDBID(); - boolean inNoise = noise.contains(p); - boolean unclassified = !processedIDs.contains(p); + boolean inNoise = noise.contains(neighbor); + boolean unclassified = !processedIDs.contains(neighbor); if(inNoise || unclassified) { if(unclassified) { seeds.add(neighbor); } - currentCluster.add(p); - processedIDs.add(p); + currentCluster.add(neighbor); + processedIDs.add(neighbor); if(inNoise) { - noise.remove(p); + noise.remove(neighbor); } } } @@ -258,9 +257,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor resultList.add(currentCluster); } else { - for(DBID id : currentCluster) { - noise.add(id); - } + noise.addDBIDs(currentCluster); noise.add(startObjectID); processedIDs.add(startObjectID); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index f1e6c945..a0780e3d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; */ import java.util.Collection; -import java.util.Iterator; import java.util.List; import java.util.Set; @@ -36,6 +35,7 @@ import 
de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; @@ -201,11 +201,11 @@ public class DeLiClu<NV extends NumberVector<NV, ?>, D extends Distance<D>> exte * @return the id of the start object for the run method */ private DBID getStartObject(Relation<NV> relation) { - Iterator<DBID> it = relation.iterDBIDs(); - if(!it.hasNext()) { + DBIDIter it = relation.iterDBIDs(); + if(!it.valid()) { return null; } - return it.next(); + return it.getDBID(); } /** diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index a70a3f6f..63ebbabb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -39,7 +39,8 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -170,28 +171,31 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri if(logger.isVerbose()) { logger.verbose("initializing " + k + " models"); } - List<Vector> means = initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC); + List<Vector> means = new ArrayList<Vector>(); + 
for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC)) { + means.add(nv.getColumnVector()); + } List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k); - List<Double> normDistrFactor = new ArrayList<Double>(k); + double[] normDistrFactor = new double[k]; List<Matrix> invCovMatr = new ArrayList<Matrix>(k); - List<Double> clusterWeights = new ArrayList<Double>(k); + double[] clusterWeights = new double[k]; probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); final int dimensionality = means.get(0).getDimensionality(); for(int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); covarianceMatrices.add(m); - normDistrFactor.add(1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det())); + normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det()); invCovMatr.add(m.inverse()); - clusterWeights.add(1.0 / k); + clusterWeights[i] = 1.0 / k; if(logger.isDebuggingFinest()) { StringBuffer msg = new StringBuffer(); msg.append(" model ").append(i).append(":\n"); msg.append(" mean: ").append(means.get(i)).append("\n"); msg.append(" m:\n").append(FormatUtil.format(m, " ")).append("\n"); msg.append(" m.det(): ").append(m.det()).append("\n"); - msg.append(" cluster weight: ").append(clusterWeights.get(i)).append("\n"); - msg.append(" normDistFact: ").append(normDistrFactor.get(i)).append("\n"); + msg.append(" cluster weight: ").append(clusterWeights[i]).append("\n"); + msg.append(" normDistFact: ").append(normDistrFactor[i]).append("\n"); logger.debugFine(msg.toString()); } } @@ -216,31 +220,31 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri double[] sumOfClusterProbabilities = new double[k]; for(int i = 0; i < k; i++) { - clusterWeights.set(i, 0.0); + clusterWeights[i] = 0.0; meanSums.add(new Vector(dimensionality)); 
covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality)); } // weights and means - for(DBID id : relation.iterDBIDs()) { - double[] clusterProbabilities = probClusterIGivenX.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); for(int i = 0; i < k; i++) { sumOfClusterProbabilities[i] += clusterProbabilities[i]; - Vector summand = relation.get(id).getColumnVector().timesEquals(clusterProbabilities[i]); + Vector summand = relation.get(iditer).getColumnVector().timesEquals(clusterProbabilities[i]); meanSums.get(i).plusEquals(summand); } } final int n = relation.size(); for(int i = 0; i < k; i++) { - clusterWeights.set(i, sumOfClusterProbabilities[i] / n); + clusterWeights[i] = sumOfClusterProbabilities[i] / n; Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]); means.set(i, newMean); } // covariance matrices - for(DBID id : relation.iterDBIDs()) { - double[] clusterProbabilities = probClusterIGivenX.get(id); - Vector instance = relation.get(id).getColumnVector(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); + Vector instance = relation.get(iditer).getColumnVector(); for(int i = 0; i < k; i++) { Vector difference = instance.minus(means.get(i)); covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i])); @@ -250,7 +254,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT)); } for(int i = 0; i < k; i++) { - normDistrFactor.set(i, 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * covarianceMatrices.get(i).det())); + normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * 
covarianceMatrices.get(i).det()); invCovMatr.set(i, covarianceMatrices.get(i).inverse()); } // reassign probabilities @@ -269,8 +273,8 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri } // provide a hard clustering - for(DBID id : relation.iterDBIDs()) { - double[] clusterProbabilities = probClusterIGivenX.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + double[] clusterProbabilities = probClusterIGivenX.get(iditer); int maxIndex = 0; double currentMax = 0.0; for(int i = 0; i < k; i++) { @@ -279,7 +283,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri currentMax = clusterProbabilities[i]; } } - hardClusters.get(maxIndex).add(id); + hardClusters.get(maxIndex).add(iditer); } final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering"); @@ -309,25 +313,25 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri * @param clusterWeights the weights of the current clusters * @return the expectation value of the current mixture of distributions */ - protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { + protected double assignProbabilitiesToInstances(Relation<V> database, double[] normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) { double emSum = 0.0; - for(DBID id : database.iterDBIDs()) { - Vector x = database.get(id).getColumnVector(); - List<Double> probabilities = new ArrayList<Double>(k); + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + Vector x = database.get(iditer).getColumnVector(); + double[] probabilities = new double[k]; for(int i 
= 0; i < k; i++) { Vector difference = x.minus(means.get(i)); double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference); double power = rowTimesCovTimesCol / 2.0; - double prob = normDistrFactor.get(i) * Math.exp(-power); + double prob = normDistrFactor[i] * Math.exp(-power); if(logger.isDebuggingFinest()) { logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); } - probabilities.add(prob); + probabilities[i] = prob; } double priorProbability = 0.0; for(int i = 0; i < k; i++) { - priorProbability += probabilities.get(i) * clusterWeights.get(i); + priorProbability += probabilities[i] * clusterWeights[i]; } double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD); if(!Double.isNaN(logP)) { @@ -337,16 +341,16 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri double[] clusterProbabilities = new double[k]; for(int i = 0; i < k; i++) { assert (priorProbability >= 0.0); - assert (clusterWeights.get(i) >= 0.0); + assert (clusterWeights[i] >= 0.0); // do not divide by zero! 
if(priorProbability == 0.0) { clusterProbabilities[i] = 0.0; } else { - clusterProbabilities[i] = probabilities.get(i) / priorProbability * clusterWeights.get(i); + clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i]; } } - probClusterIGivenX.put(id, clusterProbabilities); + probClusterIGivenX.put(iditer, clusterProbabilities); } return emSum; @@ -358,7 +362,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri * @param index Point ID * @return Probabilities of given point */ - public double[] getProbClusterIGivenX(DBID index) { + public double[] getProbClusterIGivenX(DBIDRef index) { return probClusterIGivenX.get(index); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java index 2244b07b..04b57081 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java @@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -142,22 +143,22 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && DoubleDistance.class.isInstance(epsilon)) { // Optimized codepath for double-based distances. Avoids Java // boxing/unboxing. - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { // We need to do some ugly casts to be able to run the optimized version, unfortunately. 
@SuppressWarnings("unchecked") final ClusterOrderResult<DoubleDistance> doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder); @SuppressWarnings("unchecked") final RangeQuery<O, DoubleDistance> doubleRangeQuery = RangeQuery.class.cast(rangeQuery); final DoubleDistance depsilon = DoubleDistance.class.cast(epsilon); - expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, id, depsilon, progress); + expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, iditer.getDBID(), depsilon, progress); } } } else { - for(DBID id : relation.iterDBIDs()) { - if(!processedIDs.contains(id)) { - expandClusterOrder(clusterOrder, database, rangeQuery, id, epsilon, progress); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + if(!processedIDs.contains(iditer)) { + expandClusterOrder(clusterOrder, database, rangeQuery, iditer.getDBID(), epsilon, progress); } } } @@ -194,7 +195,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor D coreDistance = last.getDistance(); for(DistanceResultPair<D> neighbor : neighbors) { - if(processedIDs.contains(neighbor.getDBID())) { + if(processedIDs.contains(neighbor)) { continue; } D reachability = DistanceUtil.max(neighbor.getDistance(), coreDistance); @@ -234,7 +235,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor double coreDistance = ((DoubleDistanceResultPair) last).getDoubleDistance(); for(DistanceResultPair<DoubleDistance> neighbor : neighbors) { - if(processedIDs.contains(neighbor.getDBID())) { + if(processedIDs.contains(neighbor)) { continue; } double reachability = Math.max(((DoubleDistanceResultPair) neighbor).getDoubleDistance(), coreDistance); @@ -247,7 +248,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor double coreDistance = last.getDistance().doubleValue(); for(DistanceResultPair<DoubleDistance> neighbor : neighbors) { - 
if(processedIDs.contains(neighbor.getDBID())) { + if(processedIDs.contains(neighbor)) { continue; } double reachability = Math.max(neighbor.getDistance().doubleValue(), coreDistance); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java index d6c5872a..3ead6f3e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java @@ -39,7 +39,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; */ public interface OPTICSTypeAlgorithm<D extends Distance<D>> extends Algorithm { @Override - ClusterOrderResult<D> run(Database database) throws IllegalStateException; + ClusterOrderResult<D> run(Database database); /** * Get the minpts value used. Needed for OPTICS Xi etc. diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java index 45b12c43..2aa38bdd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java @@ -27,7 +27,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -49,6 +48,8 @@ import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -144,7 +145,8 @@ public class SLINK<O, D extends Distance<D>> extends 
AbstractDistanceBasedAlgori ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size()); // apply the algorithm - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); step1(id); step2(id, processedIDs, distQuery, m); step3(id, processedIDs, m); @@ -200,7 +202,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * @param distFunc Distance function to use */ private void step2(DBID newID, DBIDs processedIDs, DistanceQuery<O, D> distFunc, WritableDataStore<D> m) { - for(DBID id : processedIDs) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); // M(i) = dist(i, n+1) m.put(id, distFunc.distance(id, newID)); } @@ -215,7 +218,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori */ private void step3(DBID newID, DBIDs processedIDs, WritableDataStore<D> m) { // for i = 1..n - for(DBID id : processedIDs) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); D l_i = lambda.get(id); D m_i = m.get(id); DBID p_i = pi.get(id); @@ -247,7 +251,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori */ private void step4(DBID newID, DBIDs processedIDs) { // for i = 1..n - for(DBID id : processedIDs) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); D l_i = lambda.get(id); D lp_i = lambda.get(pi.get(id)); @@ -303,7 +308,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori // extract the child clusters Map<DBID, ModifiableDBIDs> cluster_ids = new HashMap<DBID, ModifiableDBIDs>(); Map<DBID, D> cluster_distances = new HashMap<DBID, D>(); - for(DBID id : ids) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { + DBID id = it.getDBID(); DBID lastObjectInCluster = lastObjectInCluster(id, stopdist, pi, lambda); 
ModifiableDBIDs cluster = cluster_ids.get(lastObjectInCluster); if(cluster == null) { @@ -387,7 +393,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori } // right child DBID rightID = pi.get(leftID); - if(leftID.equals(rightID)) { + if(leftID.sameDBID(rightID)) { break; } Cluster<DendrogramModel<D>> right = nodes.get(rightID); @@ -472,11 +478,12 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), logger) : null; - for(DBID cur : order) { - DBID dest = pi.get(cur); - D l = lambda.get(cur); + for(DBIDIter it = order.iter(); it.valid(); it.advance()) { + DBID dest = pi.get(it); + D l = lambda.get(it); // logger.debugFine("DBID " + cur.toString() + " dist: " + l.toString()); if(stopdist != null && stopdist.compareTo(l) > 0) { + DBID cur = it.getDBID(); ModifiableDBIDs curset = cids.remove(cur); ModifiableDBIDs destset = cids.get(dest); if(destset == null) { @@ -511,13 +518,11 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori Cluster<Model> cluster = new Cluster<Model>(cname, clusids, ClusterModel.CLUSTER, hier); // Collect child clusters and clean up the cluster ids, keeping only // "new" objects. 
- Iterator<DBID> iter = clusids.iterator(); - while(iter.hasNext()) { - DBID child = iter.next(); - Cluster<Model> chiclus = clusters.get(child); + for(DBIDMIter iter = clusids.iter(); iter.valid(); iter.advance()) { + Cluster<Model> chiclus = clusters.get(iter); if(chiclus != null) { hier.add(cluster, chiclus); - clusters.remove(child); + clusters.remove(iter); iter.remove(); } } @@ -545,7 +550,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori cids.put(dest, destset); destset.add(dest); } - destset.add(cur); + destset.add(it); } } // Decrement counter diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index 7c3a13c9..ae612b2a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; @@ -154,9 +155,9 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(relation.size()); if(relation.size() >= minpts) { - for(DBID id : snnInstance.getRelation().iterDBIDs()) { + for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) { if(!processedIDs.contains(id)) { - expandCluster(snnInstance, id, objprog, clusprog); + expandCluster(snnInstance, id.getDBID(), objprog, clusprog); if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } @@ -168,7 +169,7 @@ public class SNNClustering<O> extends 
AbstractAlgorithm<Clustering<Model>> imple } } else { - for(DBID id : snnInstance.getRelation().iterDBIDs()) { + for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) { noise.add(id); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), logger); @@ -202,9 +203,9 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple */ protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) { ArrayModifiableDBIDs neighbors = DBIDUtil.newArray(); - for(DBID id : snnInstance.getRelation().iterDBIDs()) { - if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) { - neighbors.add(id); + for(DBIDIter iditer = snnInstance.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + if(snnInstance.similarity(queryObject, iditer).compareTo(epsilon) >= 0) { + neighbors.add(iditer); } } return neighbors; @@ -237,7 +238,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); - for(DBID seed : seeds) { + for(DBIDIter seed = seeds.iter(); seed.valid(); seed.advance()) { if(!processedIDs.contains(seed)) { currentCluster.add(seed); processedIDs.add(seed); @@ -253,7 +254,8 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o); if(neighborhood.size() >= minpts) { - for(DBID p : neighborhood) { + for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + DBID p = iter.getDBID(); boolean inNoise = noise.contains(p); boolean unclassified = !processedIDs.contains(p); if(inNoise || unclassified) { @@ -283,9 +285,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple resultList.add(currentCluster); } else { - for(DBID id : currentCluster) { - noise.add(id); - } + noise.addDBIDs(currentCluster); 
noise.add(startObjectID); processedIDs.add(startObjectID); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index b877415e..e4c6a123 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -48,7 +48,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.QueryUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -86,9 +86,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * Provides the CASH algorithm, an subspace clustering algorithm based on the * Hough transform. * - * <b>Note:</b> CASH requires explicitly setting the input parser other than default to - * {@link de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser}: - * (in the MiniGui, set option: dbc.parser ParameterizationFunctionLabelParser). + * <b>Note:</b> CASH requires explicitly setting the input vector type to + * {@link ParameterizationFunction}: + * (in the MiniGui, set option: parser.vector-type ParameterizationFunction). * * <p> * Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. 
Zimek: Robust @@ -503,9 +503,9 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste proxy.addRelation(prep); // Project - for(DBID id : ids) { - ParameterizationFunction f = project(basis, relation.get(id)); - prep.set(id, f); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + ParameterizationFunction f = project(basis, relation.get(iter)); + prep.set(iter, f); } if(logger.isDebugging()) { @@ -662,8 +662,8 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste double d_min = Double.POSITIVE_INFINITY; double d_max = Double.NEGATIVE_INFINITY; - for(DBID id : relation.iterDBIDs()) { - ParameterizationFunction f = relation.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + ParameterizationFunction f = relation.get(iditer); HyperBoundingBox minMax = f.determineAlphaMinMax(box); double f_min = f.function(SpatialUtil.getMin(minMax)); double f_max = f.function(SpatialUtil.getMax(minMax)); @@ -709,11 +709,11 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste ids.addDBIDs(interval.getIDs()); // Search for nearby vectors in original database - for(DBID id : relation.iterDBIDs()) { - DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef()); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DoubleVector v = new DoubleVector(relation.get(iditer).getColumnVector().getArrayRef()); DoubleDistance d = df.distance(v, centroid); if(d.compareTo(eps) < 0) { - ids.add(id); + ids.add(iditer); } } @@ -735,15 +735,15 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, CASHInterval interval) throws UnableToComplyException { DBIDs ids = interval.getIDs(); ProxyDatabase proxy = new ProxyDatabase(ids); - int dim = relation.get(ids.iterator().next()).getDimensionality(); + int 
dim = DatabaseUtil.dimensionality(relation); SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dim, new DoubleVector(new double[dim])); MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids); proxy.addRelation(prep); // Project - for(DBID id : ids) { - DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef()); - prep.set(id, v); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef()); + prep.set(iter, v); } if(logger.isDebugging()) { @@ -800,15 +800,15 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste */ private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, DBIDs ids) throws UnableToComplyException { ProxyDatabase proxy = new ProxyDatabase(ids); - int dim = relation.get(ids.iterator().next()).getDimensionality(); + int dim = DatabaseUtil.dimensionality(relation); SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dim, new DoubleVector(new double[dim])); MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids); proxy.addRelation(prep); // Project - for(DBID id : ids) { - DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef()); - prep.set(id, v); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef()); + prep.set(iter, v); } return proxy; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index 575bf117..1d41d37e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ 
-41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -176,7 +177,7 @@ public class COPAC<V extends NumberVector<V, ?>, D extends Distance<D>> extends * @return Clustering result */ @SuppressWarnings("unchecked") - public Clustering<Model> run(Relation<V> relation) throws IllegalStateException { + public Clustering<Model> run(Relation<V> relation) { if(logger.isVerbose()) { logger.verbose("Running COPAC on db size = " + relation.size() + " with dimensionality = " + DatabaseUtil.dimensionality(relation)); } @@ -189,14 +190,14 @@ public class COPAC<V extends NumberVector<V, ?>, D extends Distance<D>> extends FiniteProgress partitionProgress = logger.isVerbose() ? 
new FiniteProgress("Partitioning", relation.size(), logger) : null; int processed = 1; - for(DBID id : relation.iterDBIDs()) { - Integer corrdim = preprocin.getLocalProjection(id).getCorrelationDimension(); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + int corrdim = preprocin.getLocalProjection(iditer).getCorrelationDimension(); if(!partitionMap.containsKey(corrdim)) { partitionMap.put(corrdim, DBIDUtil.newArray()); } - partitionMap.get(corrdim).add(id); + partitionMap.get(corrdim).add(iditer); if(partitionProgress != null) { partitionProgress.setProcessed(processed++, logger); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java index af4f677f..b57a6e29 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java @@ -118,7 +118,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste * @param relation Relation to process * @return Clustering result */ - public Clustering<CorrelationModel<V>> run(Relation<V> relation) throws IllegalStateException { + public Clustering<CorrelationModel<V>> run(Relation<V> relation) { final int dimensionality = DatabaseUtil.dimensionality(relation); StepProgress stepprog = logger.isVerbose() ? 
new StepProgress(3) : null; @@ -291,7 +291,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste return parameters; } - private void buildHierarchy(SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) throws IllegalStateException { + private void buildHierarchy(SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) { StringBuffer msg = new StringBuffer(); DBSCAN<V, DoubleDistance> dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query)); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java index 41ee1f69..b8942de8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -175,9 +175,9 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { break; } ModifiableDBIDs subset = DBIDUtil.newArray(current.size()); - for(DBID id : current) { - if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { - subset.add(id); + for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) { + if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) { + subset.add(iter); } } // 
logger.verbose("size:"+subset.size()); @@ -265,16 +265,16 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { int remaining_retries = 100; for(int i = 1; i <= samples; i++) { DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong()); - final Iterator<DBID> iter = sample.iterator(); + final DBIDIter iter = sample.iter(); // Use first as origin - DBID origin = iter.next(); - Vector originV = relation.get(origin).getColumnVector(); + Vector originV = relation.get(iter).getColumnVector(); + iter.advance(); // Build orthogonal basis from remainder Matrix basis; { List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1); - while(iter.hasNext()) { - Vector vec = relation.get(iter.next()).getColumnVector(); + for(;iter.valid(); iter.advance()) { + Vector vec = relation.get(iter).getColumnVector(); vectors.add(vec.minusEquals(originV)); } // generate orthogonal basis @@ -292,12 +292,12 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { // Generate and fill a histogram. 
FlexiHistogram<Double, Double> histogram = FlexiHistogram.DoubleSumHistogram(BINS); double w = 1.0 / currentids.size(); - for(DBID point : currentids) { + for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) { // Skip sampled points - if(sample.contains(point)) { + if(sample.contains(iter2)) { continue; } - Vector vec = relation.get(point).getColumnVector().minusEquals(originV); + Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV); final double distance = deviation(vec, basis); histogram.aggregate(distance, w); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index eb5608fc..2e9f4a9b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -139,8 +140,11 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust /** * Performs the ORCLUS algorithm on the given database. 
+ * + * @param database Database + * @param relation Relation */ - public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<Model> run(Database database, Relation<V> relation) { try { DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); // current dimensionality associated with each seed @@ -211,8 +215,8 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, seed); V factory = DatabaseUtil.assumeVectorField(database).getFactory(); List<ORCLUSCluster> seeds = new ArrayList<ORCLUSCluster>(); - for(DBID id : randomSample) { - seeds.add(new ORCLUSCluster(database.get(id), id, factory)); + for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { + seeds.add(new ORCLUSCluster(database.get(iter), iter.getDBID(), factory)); } return seeds; } @@ -240,10 +244,8 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust } // for each data point o do - Iterator<DBID> it = database.iterDBIDs(); - while(it.hasNext()) { - DBID id = it.next(); - V o = database.get(id); + for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V o = database.get(it); DoubleDistance minDist = null; ORCLUSCluster minCluster = null; @@ -260,7 +262,7 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust } // add p to the cluster with the least value of projected distance assert minCluster != null; - minCluster.objectIDs.add(id); + minCluster.objectIDs.add(it); } // recompute the seed in each clusters @@ -285,10 +287,9 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust // covariance matrix of cluster // Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs); List<DistanceResultPair<DoubleDistance>> results = new ArrayList<DistanceResultPair<DoubleDistance>>(cluster.objectIDs.size()); - 
for(Iterator<DBID> it = cluster.objectIDs.iterator(); it.hasNext();) { - DBID id = it.next(); - DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(id)); - DistanceResultPair<DoubleDistance> qr = new GenericDistanceResultPair<DoubleDistance>(distance, id); + for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { + DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it)); + DistanceResultPair<DoubleDistance> qr = new GenericDistanceResultPair<DoubleDistance>(distance, it.getDBID()); results.add(qr); } Collections.sort(results); @@ -407,9 +408,8 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust DoubleDistance sum = getDistanceFunction().getDistanceFactory().nullDistance(); V c_proj = projection(c_ij, c_ij.centroid, factory); - for(DBID id : c_ij.objectIDs) { - V o = database.get(id); - V o_proj = projection(c_ij, o, factory); + for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { + V o_proj = projection(c_ij, database.get(iter), factory); DoubleDistance dist = distFunc.distance(o_proj, c_proj); sum = sum.plus(dist.times(dist)); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java index 86e045cb..b0a12832 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java @@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.HyperBoundingBox; import de.lmu.ifi.dbs.elki.data.ParameterizationFunction; import de.lmu.ifi.dbs.elki.data.spatial.SpatialUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ 
-114,7 +115,8 @@ public class CASHIntervalSplit { f_maxima.put(interval, maxima); } - for(DBID id : superSetIDs) { + for(DBIDIter iter = superSetIDs.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); Double f_min = minima.get(id); Double f_max = maxima.get(id); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java new file mode 100644 index 00000000..e75a89dc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java @@ -0,0 +1,80 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; + +/** + * Predicate for GeneralizedDBSCAN to evaluate whether a point is a core point + * or not. + * + * Note the Factory/Instance split of this interface. 
+ * + * @author Erich Schubert + * + * @apiviz.has Instance + */ +public interface CorePredicate { + /** + * Constant for the generic type {@code List<? extends DistanceResultPair<?>>} + */ + public static final String NEIGHBOR_LIST = "neighborhood-list"; + + /** + * Instantiate for a database. + * + * @param database Database to instantiate for + * @param type Type to instantiate for + * @return Instance + */ + public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type); + + /** + * Test whether the neighborhood type T is accepted by this predicate. + * + * @param type Type information + * @return true when the type is accepted + */ + public boolean acceptsType(SimpleTypeInformation<?> type); + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + * + * @param <T> actual type + */ + public static interface Instance<T> { + /** + * Decide whether the point is a core point, based on its neighborhood. + * + * @param point Query point + * @param neighbors Neighbors + * @return core point property + */ + public boolean isCorePoint(DBIDRef point, T neighbors); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java new file mode 100644 index 00000000..cb24e8f1 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java @@ -0,0 +1,268 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.DistanceDBIDResult; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * The default DBSCAN and OPTICS neighbor predicate, using an + * epsilon-neighborhood. + * + * <p> + * Reference: <br> + * M. Ester, H.-P. Kriegel, J. Sander, and X. Xu: A Density-Based Algorithm for + * Discovering Clusters in Large Spatial Databases with Noise. <br> + * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), + * Portland, OR, 1996. + * </p> + * + * @author Erich Schubert + * + * @param <D> Distance type + */ +@Reference(authors = "M. Ester, H.-P. 
Kriegel, J. Sander, and X. Xu", title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.71.1980") +public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements NeighborPredicate { + /** + * Range to query with + */ + D epsilon; + + /** + * Distance function to use + */ + DistanceFunction<O, D> distFunc; + + /** + * Full constructor. + * + * @param epsilon Epsilon value + * @param distFunc Distance function to use + */ + public EpsilonNeighborPredicate(D epsilon, DistanceFunction<O, D> distFunc) { + super(); + this.epsilon = epsilon; + this.distFunc = distFunc; + } + + @SuppressWarnings("unchecked") + @Override + public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type) { + if(TypeUtil.DBIDS.isAssignableFromType(type)) { + DistanceQuery<O, D> dq = QueryUtil.getDistanceQuery(database, distFunc); + RangeQuery<O, D> rq = database.getRangeQuery(dq); + return (Instance<T>) new DBIDInstance<D>(epsilon, rq, dq.getRelation().getDBIDs()); + } + if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) { + DistanceQuery<O, D> dq = QueryUtil.getDistanceQuery(database, distFunc); + RangeQuery<O, D> rq = database.getRangeQuery(dq); + return (Instance<T>) new NeighborListInstance<D>(epsilon, rq, dq.getRelation().getDBIDs()); + } + throw new AbortException("Incompatible predicate types"); + } + + @Override + public SimpleTypeInformation<?>[] getOutputType() { + return new SimpleTypeInformation<?>[] { TypeUtil.DBIDS, TypeUtil.NEIGHBORLIST }; + } + + @Override + public TypeInformation getInputTypeRestriction() { + return distFunc.getInputTypeRestriction(); + } + + /** + * Instance for a particular data set. 
+ * + * @author Erich Schubert + */ + public static class DBIDInstance<D extends Distance<D>> implements NeighborPredicate.Instance<DBIDs> { + /** + * Range to query with + */ + D epsilon; + + /** + * Range query to use on the database. + */ + RangeQuery<?, D> rq; + + /** + * DBIDs to process + */ + DBIDs ids; + + /** + * Constructor. + * + * @param epsilon Epsilon + * @param rq Range query to use + * @param ids DBIDs to process + */ + public DBIDInstance(D epsilon, RangeQuery<?, D> rq, DBIDs ids) { + super(); + this.epsilon = epsilon; + this.rq = rq; + this.ids = ids; + } + + @Override + public DBIDs getIDs() { + return ids; + } + + @Override + public DBIDs getNeighbors(DBIDRef reference) { + List<DistanceResultPair<D>> res = rq.getRangeForDBID(reference, epsilon); + // Throw away the actual distance values ... + ModifiableDBIDs neighbors = DBIDUtil.newHashSet(res.size()); + for(DistanceResultPair<D> dr : res) { + neighbors.add(dr); + } + return neighbors; + } + + @Override + public void addDBIDs(ModifiableDBIDs ids, DBIDs neighbors) { + ids.addDBIDs(neighbors); + } + } + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + */ + public static class NeighborListInstance<D extends Distance<D>> implements NeighborPredicate.Instance<DistanceDBIDResult<D>> { + /** + * Range to query with + */ + D epsilon; + + /** + * Range query to use on the database. + */ + RangeQuery<?, D> rq; + + /** + * DBIDs to process + */ + DBIDs ids; + + /** + * Constructor. 
+ * + * @param epsilon Epsilon + * @param rq Range query to use + * @param ids DBIDs to process + */ + public NeighborListInstance(D epsilon, RangeQuery<?, D> rq, DBIDs ids) { + super(); + this.epsilon = epsilon; + this.rq = rq; + this.ids = ids; + } + + @Override + public DBIDs getIDs() { + return ids; + } + + @Override + public DistanceDBIDResult<D> getNeighbors(DBIDRef reference) { + return rq.getRangeForDBID(reference, epsilon); + } + + @Override + public void addDBIDs(ModifiableDBIDs ids, DistanceDBIDResult<D> neighbors) { + for(DistanceResultPair<D> neighbor : neighbors) { + ids.add(neighbor); + } + } + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends Distance<D>> extends AbstractParameterizer { + /** + * Range to query with + */ + D epsilon; + + /** + * Distance function to use + */ + DistanceFunction<O, D> distfun = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Get a distance function. + ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<DistanceFunction<O, D>>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + D distanceFactory = null; + if(config.grab(distanceP)) { + distfun = distanceP.instantiateClass(config); + distanceFactory = distfun.getDistanceFactory(); + } + // Get the epsilon parameter + DistanceParameter<D> epsilonP = new DistanceParameter<D>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory); + if(config.grab(epsilonP)) { + epsilon = epsilonP.getValue(); + } + } + + @Override + protected EpsilonNeighborPredicate<O, D> makeInstance() { + return new EpsilonNeighborPredicate<O, D>(epsilon, distfun); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java new file mode 100644 index 00000000..2e1c2093 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java @@ -0,0 +1,323 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import gnu.trove.list.array.TIntArrayList; + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Generalized DBSCAN, density-based clustering with noise. + * <p> + * Reference:<br /> + * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu:<br /> + * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its + * Applications<br /> + * In: Data Mining and Knowledge Discovery, 1998. 
 * </p>
 *
 * @author Erich Schubert
 * @author Arthur Zimek
 *
 * @apiviz.has Instance
 */
@Reference(authors = "Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu", title = "Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its Applications", booktitle = "Data Mining and Knowledge Discovery", url = "http://dx.doi.org/10.1023/A:1009745219419")
public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * Class logger. Also used by the nested {@link Instance}.
   */
  final static Logging logger = Logging.getLogger(GeneralizedDBSCAN.class);

  /**
   * The neighborhood predicate factory (defines which objects count as
   * neighbors of a query object).
   */
  NeighborPredicate npred;

  /**
   * The core predicate factory (decides whether an object is dense enough to
   * start or extend a cluster).
   */
  CorePredicate corepred;

  /**
   * Constructor for parameterized algorithm.
   *
   * @param npred Neighbor predicate
   * @param corepred Core point predicate
   */
  public GeneralizedDBSCAN(NeighborPredicate npred, CorePredicate corepred) {
    super();
    this.npred = npred;
    this.corepred = corepred;
  }

  /**
   * Run the algorithm on the given database.
   *
   * Negotiates a common neighborhood data type between the neighbor
   * predicate's outputs and the core predicate's accepted inputs, then
   * delegates the actual clustering to {@link Instance#run()}.
   *
   * @param database Database to process
   * @return Clustering result (the first cluster holds the noise points)
   */
  @Override
  public Clustering<Model> run(Database database) {
    // Use the first output type of the neighbor predicate that the core
    // predicate accepts; instantiate both predicates for that type.
    for (SimpleTypeInformation<?> t : npred.getOutputType()) {
      if (corepred.acceptsType(t)) {
        return new Instance<Object>(npred.instantiate(database, t), corepred.instantiate(database, t)).run();
      }
    }
    throw new AbortException("No compatible types found.");
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    // Input restriction is inherited from the neighborhood predicate.
    return TypeUtil.array(npred.getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return logger;
  }

  /**
   * Instance for a particular data set.
   *
   * @author Erich Schubert
   *
   * @param <T> neighborhood data type produced by the neighbor predicate and
   *        consumed by the core predicate
   */
  public class Instance<T> {
    /**
     * The neighborhood predicate, bound to the data set.
     */
    final NeighborPredicate.Instance<T> npred;

    /**
     * The core object property, bound to the data set.
     */
    final CorePredicate.Instance<T> corepred;

    /**
     * Full Constructor
     *
     * @param npred Neighborhood predicate
     * @param corepred Core object predicate
     */
    public Instance(NeighborPredicate.Instance<T> npred, CorePredicate.Instance<T> corepred) {
      super();
      this.npred = npred;
      this.corepred = corepred;
    }

    /**
     * Run the actual GDBSCAN algorithm.
     *
     * Cluster labels in the temporary integer store use two sentinels:
     * -2 means "not yet processed", -1 means "noise"; real clusters are
     * numbered from 0 upwards.
     *
     * @return Clustering result; the first cluster (index 0) is the noise
     *         cluster
     */
    public Clustering<Model> run() {
      final DBIDs ids = npred.getIDs();
      // Setup progress logging
      final FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", ids.size(), logger) : null;
      final IndefiniteProgress clusprogress = logger.isVerbose() ? new IndefiniteProgress("Clusters", logger) : null;
      // (Temporary) store the cluster ID assigned; -2 marks unprocessed.
      final WritableIntegerDataStore clusterids = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -2);
      // Note: these sizes are not exact! A point first labeled noise may later
      // be pulled into a cluster without these counters being adjusted; they
      // only serve to presize the result containers below.
      final TIntArrayList clustersizes = new TIntArrayList();

      // Sentinel label for noise points.
      final int noiseid = -1;
      int clusterid = 0;
      int clustersize = 0;
      int noisesize = 0;
      // Iterate over all objects in the database.
      for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
        // Skip already processed ids (anything above the -2 sentinel).
        if(clusterids.intValue(id) > -2) {
          continue;
        }
        // Evaluate Neighborhood predicate
        final T neighbors = npred.getNeighbors(id);
        // Evaluate Core-Point predicate:
        if(corepred.isCorePoint(id, neighbors)) {
          clusterids.putInt(id, clusterid);
          clustersize = 1 + setbasedExpandCluster(clusterid, clusterids, neighbors, progress);
          // start next cluster on next iteration.
          clustersizes.add(clustersize);
          clustersize = 0;
          clusterid += 1;
          if(clusprogress != null) {
            clusprogress.setProcessed(clusterid, logger);
          }
        }
        else {
          // otherwise, it's a noise point (it may still be claimed by a
          // cluster during expansion later).
          clusterids.putInt(id, noiseid);
          noisesize += 1;
        }
        // We've completed this element
        if(progress != null) {
          progress.incrementProcessed(logger);
        }
      }
      // Finish progress logging.
      if(progress != null) {
        progress.ensureCompleted(logger);
      }
      if(clusprogress != null) {
        clusprogress.setCompleted(logger);
      }

      // Transform cluster ID mapping into a clustering result:
      ArrayList<ArrayModifiableDBIDs> clusterlists = new ArrayList<ArrayModifiableDBIDs>(clusterid + 1);
      // add noise cluster storage (slot 0)
      clusterlists.add(DBIDUtil.newArray(noisesize));
      // add storage containers for clusters (cluster c maps to slot c + 1)
      for(int i = 0; i < clustersizes.size(); i++) {
        clusterlists.add(DBIDUtil.newArray(clustersizes.get(i)));
      }
      // do the actual inversion of the id-to-cluster mapping
      for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
        int cluster = clusterids.intValue(id);
        clusterlists.get(cluster + 1).add(id);
      }
      clusterids.destroy();

      Clustering<Model> result = new Clustering<Model>("GDBSCAN", "gdbscan-clustering");
      int cid = 0;
      for(ArrayModifiableDBIDs res : clusterlists) {
        // Slot 0 collects the noise points.
        boolean isNoise = (cid == 0);
        Cluster<Model> c = new Cluster<Model>(res, isNoise, ClusterModel.CLUSTER);
        result.addCluster(c);
        cid++;
      }
      return result;
    }

    /**
     * Set-based expand cluster implementation.
     *
     * The textbook recursion is unrolled into iteration over an explicit
     * active set to avoid stack overflows on large clusters.
     *
     * @param clusterid ID of the current cluster.
     * @param clusterids Current object to cluster mapping.
     * @param neighbors Neighbors acquired by initial getNeighbors call.
     * @param progress Progress logging
     *
     * @return cluster size (number of objects pulled into the cluster here);
     */
    protected int setbasedExpandCluster(final int clusterid, final WritableIntegerDataStore clusterids, final T neighbors, final FiniteProgress progress) {
      int clustersize = 0;
      final ArrayModifiableDBIDs activeSet = DBIDUtil.newArray();
      npred.addDBIDs(activeSet, neighbors);
      // run expandCluster as long as this set is non-empty (non-recursive
      // implementation)
      while(!activeSet.isEmpty()) {
        // Pop from the end, which avoids shifting the array contents.
        final DBID id = activeSet.remove(activeSet.size() - 1);
        clustersize += 1;
        // Assign object to cluster, remembering its previous label.
        final int oldclus = clusterids.putInt(id, clusterid);
        if(oldclus == -2) {
          // Previously unprocessed: expandCluster again:
          // Evaluate Neighborhood predicate
          final T newneighbors = npred.getNeighbors(id);
          // Evaluate Core-Point predicate
          if(corepred.isCorePoint(id, newneighbors)) {
            // Note: the recursion is unrolled into iteration over the active
            // set.
            npred.addDBIDs(activeSet, newneighbors);
          }
          if(progress != null) {
            progress.incrementProcessed(logger);
          }
        }
      }
      return clustersize;
    }
  }

  /**
   * Parameterization class
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Neighborhood predicate
     */
    NeighborPredicate npred = null;

    /**
     * Core point predicate
     */
    CorePredicate corepred = null;

    /**
     * Parameter for neighborhood predicate
     */
    public final static OptionID NEIGHBORHOODPRED_ID = OptionID.getOrCreateOptionID("gdbscan.neighborhood", "Neighborhood predicate for GDBSCAN");

    /**
     * Parameter for core predicate
     */
    public final static OptionID COREPRED_ID = OptionID.getOrCreateOptionID("gdbscan.core", "Core point predicate for GDBSCAN");

    @Override
    protected void makeOptions(Parameterization config) {
      // Neighborhood predicate
      ObjectParameter<NeighborPredicate> npredOpt = new ObjectParameter<NeighborPredicate>(NEIGHBORHOODPRED_ID, NeighborPredicate.class, EpsilonNeighborPredicate.class);
      if(config.grab(npredOpt)) {
        npred = npredOpt.instantiateClass(config);
      }

      // Core point predicate
      ObjectParameter<CorePredicate> corepredOpt = new ObjectParameter<CorePredicate>(COREPRED_ID, CorePredicate.class, MinPtsCorePredicate.class);
      if(config.grab(corepredOpt)) {
        corepred = corepredOpt.instantiateClass(config);
      }
    }

    @Override
    protected GeneralizedDBSCAN makeInstance() {
      return new GeneralizedDBSCAN(npred, corepred);
    }
  }
}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java new file mode 100644 index 00000000..b9852eca --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java @@ -0,0 +1,178 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * The DBSCAN default core point predicate -- having at least {@link #minpts} + * neighbors. + * + * <p> + * Reference: <br> + * M. Ester, H.-P. Kriegel, J. Sander, and X. Xu: A Density-Based Algorithm for + * Discovering Clusters in Large Spatial Databases with Noise. <br> + * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), + * Portland, OR, 1996. + * </p> + * + * @author Erich Schubert + * + * @apiviz.has Instance + */ +@Reference(authors = "M. Ester, H.-P. Kriegel, J. Sander, and X. Xu", title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.71.1980") +public class MinPtsCorePredicate implements CorePredicate { + /** + * The minpts parameter. + */ + int minpts; + + /** + * Default constructor. + * + * @param minpts Minimum number of neighbors to be a core point. 
+ */ + public MinPtsCorePredicate(int minpts) { + super(); + this.minpts = minpts; + } + + @SuppressWarnings("unchecked") + @Override + public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type) { + if(TypeUtil.DBIDS.isAssignableFromType(type)) { + return (Instance<T>) new DBIDsInstance(minpts); + } + if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) { + return (Instance<T>) new NeighborListInstance(minpts); + } + throw new AbortException("Incompatible predicate types"); + } + + @Override + public boolean acceptsType(SimpleTypeInformation<?> type) { + if(TypeUtil.DBIDS.isAssignableFromType(type)) { + return true; + } + if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) { + return true; + } + return false; + } + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + */ + public static class DBIDsInstance implements CorePredicate.Instance<DBIDs> { + /** + * The minpts parameter. + */ + int minpts; + + /** + * Constructor for this predicate. + * + * @param minpts MinPts parameter + */ + public DBIDsInstance(int minpts) { + super(); + this.minpts = minpts; + } + + @Override + public boolean isCorePoint(DBIDRef point, DBIDs neighbors) { + return neighbors.size() >= minpts; + } + } + + /** + * Instance for a particular data set. + * + * @author Erich Schubert + */ + public static class NeighborListInstance implements CorePredicate.Instance<List<? extends DistanceResultPair<?>>> { + /** + * The minpts parameter. + */ + int minpts; + + /** + * Constructor for this predicate. + * + * @param minpts MinPts parameter + */ + public NeighborListInstance(int minpts) { + super(); + this.minpts = minpts; + } + + @Override + public boolean isCorePoint(DBIDRef point, List<? 
extends DistanceResultPair<?>> neighbors) { + return neighbors.size() >= minpts; + } + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Minpts value + */ + int minpts; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + // Get the minpts parameter + IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID); + if(config.grab(minptsP)) { + minpts = minptsP.getValue(); + } + } + + @Override + protected MinPtsCorePredicate makeInstance() { + return new MinPtsCorePredicate(minpts); + } + } +}
\ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java
new file mode 100644
index 00000000..4f9eca27
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java
@@ -0,0 +1,94 @@
package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2012
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;

/**
 * Get the neighbors of an object — the neighborhood predicate of Generalized
 * DBSCAN.
 *
 * Note the Factory/Instance split of this interface: the outer interface is a
 * database-independent factory, while {@link Instance} is bound to one
 * particular data set.
 *
 * @author Erich Schubert
 *
 * @apiviz.has Instance
 */
public interface NeighborPredicate {
  /**
   * Instantiate for a database.
   *
   * @param database Database to instantiate for
   * @param type Requested neighborhood data type (expected to be one of
   *        {@link #getOutputType()})
   * @param <T> Neighborhood data type
   * @return Instance
   */
  public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type);

  /**
   * Input data type restriction.
   *
   * @return Type restriction
   */
  public TypeInformation getInputTypeRestriction();

  /**
   * Output data type information: the neighborhood representations this
   * predicate can produce.
   *
   * @return Type information
   */
  public SimpleTypeInformation<?>[] getOutputType();

  /**
   * Instance for a particular data set.
   *
   * @author Erich Schubert
   *
   * @param <T> Neighborhood data type
   */
  public static interface Instance<T> {
    /**
     * Get the neighbors of a reference object for DBSCAN.
     *
     * @param reference Reference object
     * @return Neighborhood
     */
    public T getNeighbors(DBIDRef reference);

    /**
     * Get the IDs the predicate is defined for.
     *
     * @return Database ids
     */
    public DBIDs getIDs();

    /**
     * Add the neighbors to a DBID set.
     *
     * @param ids ID set to add to
     * @param neighbors Neighbors to add
     */
    public void addDBIDs(ModifiableDBIDs ids, T neighbors);
  }
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java
new file mode 100644
index 00000000..8be23c7d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java
@@ -0,0 +1,43 @@
/**
 * <p>Generalized DBSCAN.</p>
 *
 * Generalized DBSCAN is an abstraction of the original DBSCAN idea,
 * that allows the use of arbitrary "neighborhood" and "core point" predicates.
 *
 * For each object, the neighborhood as defined by the "neighborhood" predicate
 * is retrieved - in original DBSCAN, this is the objects within an epsilon sphere
 * around the query object. Then the core point predicate is evaluated to decide if
 * the object is considered dense. If so, a cluster is started (or extended) to
 * include the neighbors as well.
 *
 * <p>
 * Reference:<br />
 * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu:<br />
 * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its
 * Applications<br />
 * In: Data Mining and Knowledge Discovery, 1998.
 * </p>
 */
/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2012
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index d3c73b53..92862909 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -1,24 +1,5 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.data.model.MeanModel; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures @@ -42,37 +23,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; + /** * Abstract base class for k-means implementations. * * @author Erich Schubert * + * @apiviz.composedOf KMeansInitialization + * * @param <V> Vector type * @param <D> Distance type */ -public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> { - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater than 0. - */ - public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); - - /** - * Parameter to specify the number of clusters to find, must be an integer - * greater or equal to 0, where 0 means no limit. 
- */ - public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); - - /** - * Parameter to specify the random generator seed. - */ - public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); - - /** - * Parameter to specify the initialization method - */ - public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); - +public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> implements KMeans { /** * Holds the value of {@link #K_ID}. */ @@ -94,6 +77,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means */ public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { super(distanceFunction); @@ -111,15 +95,15 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param clusters cluster assignment * @return true when the object was reassigned */ - protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) { + protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?, ?>> means, List<? extends ModifiableDBIDs> clusters) { boolean changed = false; if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?, ?>>) getDistanceFunction(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); @@ -128,13 +112,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis mindist = dist; } } - if(clusters.get(minIndex).add(id)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? for(int i = 0; i < k; i++) { if(i != minIndex) { - if(clusters.get(i).remove(id)) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -144,9 +128,9 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } else { final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction(); - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); @@ -155,13 +139,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis mindist = dist; } } - if(clusters.get(minIndex).add(id)) { + if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? 
for(int i = 0; i < k; i++) { if(i != minIndex) { - if(clusters.get(i).remove(id)) { + if(clusters.get(i).remove(iditer)) { break; } } @@ -185,25 +169,23 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis * @param database the database containing the vectors * @return the mean vectors of the given clusters in the given database */ - protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) { + protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> means, Relation<V> database) { List<Vector> newMeans = new ArrayList<Vector>(k); for(int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) { - if(mean == null) { - mean = database.get(clusterIter.next()).getColumnVector(); - } - else { - mean.plusEquals(database.get(clusterIter.next()).getColumnVector()); - } - } if(list.size() > 0) { - assert mean != null; - mean.timesEquals(1.0 / list.size()); + double s = 1.0 / list.size(); + DBIDIter iter = list.iter(); + assert (iter.valid()); + mean = database.get(iter).getColumnVector().timesEquals(s); + iter.advance(); + for(; iter.valid(); iter.advance()) { + mean.plusTimesEquals(database.get(iter).getColumnVector(), s); + } } else { - mean = means.get(i); + mean = means.get(i).getColumnVector(); } newMeans.add(mean); } @@ -211,6 +193,36 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis } /** + * Returns the median vectors of the given clusters in the given database. + * + * @param clusters the clusters to compute the means + * @param medians the recent medians + * @param database the database containing the vectors + * @return the mean vectors of the given clusters in the given database + */ + protected List<NumberVector<?, ?>> medians(List<? extends ModifiableDBIDs> clusters, List<? 
extends NumberVector<?, ?>> medians, Relation<V> database) { + final int dim = medians.get(0).getDimensionality(); + final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); + List<NumberVector<?, ?>> newMedians = new ArrayList<NumberVector<?, ?>>(k); + for(int i = 0; i < k; i++) { + ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); + if(list.size() > 0) { + Vector mean = new Vector(dim); + for(int d = 0; d < dim; d++) { + sorter.setDimension(d + 1); + DBID id = QuickSelect.median(list, sorter); + mean.set(d, database.get(id).doubleValue(d + 1)); + } + newMedians.add(mean); + } + else { + newMedians.add((NumberVector<?, ?>) medians.get(i)); + } + } + return newMedians; + } + + /** * Compute an incremental update for the mean * * @param mean Mean to update @@ -239,16 +251,16 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis */ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { boolean changed = false; - + if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?, ?>>) getDistanceFunction(); - + // Incremental update - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); @@ -261,13 +273,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis for(int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); if(i == minIndex) { - if(ci.add(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + if(ci.add(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } } - else if(ci.remove(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + else if(ci.remove(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } @@ -276,11 +288,11 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis else { // Raw distance function final PrimitiveDistanceFunction<? 
super NumberVector<?, ?>, D> df = getDistanceFunction(); - + // Incremental update - for(DBID id : relation.iterDBIDs()) { + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); - V fv = relation.get(id); + V fv = relation.get(iditer); int minIndex = 0; for(int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); @@ -293,13 +305,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis for(int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); if(i == minIndex) { - if(ci.add(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1); + if(ci.add(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } } - else if(ci.remove(id)) { - incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1); + else if(ci.remove(iditer)) { + incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java index b5f088fb..a8effecd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -22,7 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; @@ -34,9 +33,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter; * * @param <V> Vector type */ -public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> { +public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> { /** - * Holds the value of {@link KMeansLloyd#SEED_ID}. + * Holds the value of {@link KMeans#SEED_ID}. */ protected Long seed; @@ -56,13 +55,13 @@ public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> * * @apiviz.exclude */ - public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer { + public abstract static class Parameterizer<V> extends AbstractParameterizer { protected Long seed; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true); + LongParameter seedP = new LongParameter(KMeans.SEED_ID, true); if(config.grab(seedP)) { seed = seedP.getValue(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java index 78ccd426..7a7f2867 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -23,14 +23,16 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ import java.util.ArrayList; -import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** @@ -40,20 +42,30 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; * * @param <V> Vector type */ -public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { +public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsInitialization<V> { /** * Constructor. */ public FirstKInitialMeans() { - super(null); + super(); } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { - Iterator<DBID> iter = relation.iterDBIDs(); - List<Vector> means = new ArrayList<Vector>(k); - for(int i = 0; i < k && iter.hasNext(); i++) { - means.add(relation.get(iter.next()).getColumnVector()); + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + DBIDIter iter = relation.iterDBIDs(); + List<V> means = new ArrayList<V>(k); + for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { + means.add(relation.get(iter)); + } + return means; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? 
super V, ?> distanceFunction) { + DBIDIter iter = distanceFunction.getRelation().iterDBIDs(); + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { + means.add(iter); } return means; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java new file mode 100644 index 00000000..37171d4a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -0,0 +1,55 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Some constants and options shared among kmeans family algorithms. 
+ * + * @author Erich Schubert + */ +public interface KMeans { + /** + * Parameter to specify the initialization method + */ + public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means."); + + /** + * Parameter to specify the number of clusters to find, must be an integer + * greater than 0. + */ + public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find."); + + /** + * Parameter to specify the maximum number of iterations, must be an integer + * greater or equal to 0, where 0 means no limit. + */ + public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit."); + + /** + * Parameter to specify the random generator seed. + */ + public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed."); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java index f4c0d9c7..9e5d69f0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -24,19 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.List; -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** * Interface for initializing K-Means * * @author Erich Schubert * - * @param <V> Vector type + * @param <V> Object type */ -public interface KMeansInitialization<V extends NumberVector<V, ?>> { +public interface KMeansInitialization<V> { /** * Choose initial means * @@ -45,5 +43,5 @@ public interface KMeansInitialization<V extends NumberVector<V, ?>> { * @param distanceFunction Distance function * @return List of chosen means for k-means */ - public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); + public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index fda1d6c0..b1b40632 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -39,7 +39,6 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -94,14 +93,13 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex * @param database Database * @param relation relation to use * @return result - * @throws IllegalStateException */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { if(relation.size() <= 0) { return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? 
extends NumberVector<?, ?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); for(int i = 0; i < k; i++) { @@ -124,7 +122,7 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); for(int i = 0; i < clusters.size(); i++) { - MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); } return result; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index 56492dd0..c729eb10 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -93,15 +93,17 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> * * @param database Database * @param relation relation to use - * @return result - * @throws IllegalStateException + * @return Clustering result */ - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { if(relation.size() <= 0) { return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<Vector> means = new ArrayList<Vector>(k); + for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, 
getDistanceFunction())) { + means.add(nv.getColumnVector()); + } // Initialize cluster and assign objects List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); for(int i = 0; i < k; i++) { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index c7a2fa1d..9afeff6c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -26,19 +26,18 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; -import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -59,7 +58,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; * @param <D> Distance type */ @Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. 
of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494") -public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> { +public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { /** * Constructor. * @@ -70,7 +69,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("K-Means++ initialization can only be used with numerical distances."); @@ -80,14 +79,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); // Chose first mean - List<Vector> means = new ArrayList<Vector>(k); + List<V> means = new ArrayList<V>(k); Random random = (seed != null) ? 
new Random(seed) : new Random(); - DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next(); - means.add(relation.get(first).getColumnVector()); + DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iter().getDBID(); + means.add(relation.get(first)); - ModifiableDBIDs chosen = DBIDUtil.newHashSet(k); - chosen.add(first); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); // Initialize weights double[] weights = new double[ids.size()]; @@ -107,16 +104,16 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends } // Add new mean: DBID newmean = ids.get(pos); - means.add(relation.get(newmean).getColumnVector()); - chosen.add(newmean); + means.add(relation.get(newmean)); // Update weights: weights[pos] = 0.0; // Choose optimized version for double distances, if applicable. - if (distF instanceof PrimitiveDoubleDistanceFunction) { + if(distF instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF; weightsum = updateWeights(weights, ids, newmean, ddist, relation); - } else { + } + else { weightsum = updateWeights(weights, ids, newmean, distQ); } } @@ -124,6 +121,48 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends return means; } + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; + // Chose first mean + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + + Random random = (seed != null) ? 
new Random(seed) : new Random(); + DBID first = DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random.nextLong()).iter().getDBID(); + means.add(first); + + ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs()); + // Initialize weights + double[] weights = new double[ids.size()]; + double weightsum = initialWeights(weights, ids, first, distQ); + while(means.size() < k) { + if(weightsum > Double.MAX_VALUE) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?"); + } + if(weightsum < Double.MIN_NORMAL) { + LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?"); + } + double r = random.nextDouble() * weightsum; + int pos = 0; + while(r > 0 && pos < weights.length) { + r -= weights[pos]; + pos++; + } + // Add new mean: + DBID newmean = ids.get(pos); + means.add(newmean); + // Update weights: + weights[pos] = 0.0; + weightsum = updateWeights(weights, ids, newmean, distQ); + } + + return means; + } + /** * Initialize the weight list. * @@ -133,16 +172,15 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends * @param distQ Distance query * @return Weight sum */ - protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? 
super V, D> distQ) { double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); - if(latest.equals(id)) { + if(latest.sameDBID(it)) { weights[i] = 0.0; } else { - double d = distQ.distance(latest, id).doubleValue(); + double d = distQ.distance(latest, it).doubleValue(); weights[i] = d * d; } weightsum += weights[i]; @@ -159,13 +197,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends * @param distQ Distance query * @return Weight sum */ - protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) { + protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) { double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); if(weights[i] > 0.0) { - double d = distQ.distance(latest, id).doubleValue(); + double d = distQ.distance(latest, it).doubleValue(); weights[i] = Math.min(weights[i], d * d); weightsum += weights[i]; } @@ -187,9 +224,8 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends double weightsum = 0.0; DBIDIter it = ids.iter(); for(int i = 0; i < weights.length; i++, it.advance()) { - DBID id = it.getDBID(); if(weights[i] > 0.0) { - double d = distF.doubleDistance(lv, rel.get(id)); + double d = distF.doubleDistance(lv, rel.get(it)); weights[i] = Math.min(weights[i], d * d); weightsum += weights[i]; } @@ -204,7 +240,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected KMeansPlusPlusInitialMeans<V, D> 
makeInstance() { return new KMeansPlusPlusInitialMeans<V, D>(seed); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java new file mode 100644 index 00000000..8c284981 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -0,0 +1,172 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medians clustering algorithm, using Lloyd-style bulk + * iterations. + * + * Reference: + * <p> + * Clustering via Concave Minimization<br /> + * P. S. Bradley, O. L. Mangasarian, W. N. Street<br /> + * in: Advances in neural information processing systems + * </p> + * + * @author Erich Schubert + * + * @apiviz.has MeanModel + * + * @param <V> vector datatype + * @param <D> distance value type + */ +@Title("K-Medians") +@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. 
Street", booktitle = "Advances in neural information processing systems", url="http://nips.djvuzone.org/djvu/nips09/0368.djvu") +public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMediansLloyd.class); + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + */ + public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + super(distanceFunction, k, maxiter, initializer); + } + + /** + * Run k-medians + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + } + // Choose initial medians + List<? extends NumberVector<?, ?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { + if(logger.isVerbose()) { + logger.verbose("K-Medians iteration " + (iteration + 1)); + } + boolean changed = assignToNearestCluster(relation, medians, clusters); + // Stop if no cluster assignment changed. + if(!changed) { + break; + } + // Recompute medians. 
+ medians = medians(clusters, medians, relation); + } + // Wrap result + final V factory = DatabaseUtil.assumeVectorField(relation).getFactory(); + Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); + result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + } + return result; + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> { + protected int k; + + protected int maxiter; + + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected AbstractKMeans<V, D> makeInstance() { + return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java new file mode 100644 index 00000000..a5c3d675 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -0,0 +1,271 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Provides the k-medoids clustering algorithm, using a "bulk" variation of the + * "Partitioning Around Medoids" approach. + * + * In contrast to PAM, which will in each iteration update one medoid with one + * (arbitrary) non-medoid, this implementation follows the EM pattern. 
In the + * expectation step, the best medoid from the cluster members is chosen; in the + * M-step, the objects are reassigned to their nearest medoid. + * + * We do not have a reference for this algorithm. It borrows ideas from EM and + * PAM. If needed, you are welcome cite it using the latest ELKI publication + * (this variation is likely not worth publishing on its own). + * + * @author Erich Schubert + * + * @apiviz.has MedoidModel + * @apiviz.composedOf KMedoidsInitialization + * + * @param <V> vector datatype + * @param <D> distance value type + */ +public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMedoidsEM.class); + + /** + * Holds the value of {@link AbstractKMeans#K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMedoidsInitialization<V> initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means + */ + public KMedoidsEM(PrimitiveDistanceFunction<? 
super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Run k-medoids + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering<MedoidModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + } + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + Mean[] mdists = Mean.newArray(k); + + // Initial assignment to nearest medoids + // TODO: reuse this information, from the build phase, when possible? 
+ assignToNearestCluster(medoids, mdists, clusters, distQ); + + // Swap phase + boolean changed = true; + while(changed) { + changed = false; + // Try to swap the medoid with a better cluster member: + for(int i = 0; i < k; i++) { + DBID med = medoids.get(i); + DBID best = null; + Mean bestm = mdists[i]; + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(med.sameDBID(iter)) { + continue; + } + Mean mdist = new Mean(); + for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) { + mdist.put(distQ.distance(iter, iter2).doubleValue()); + } + if(mdist.getMean() < bestm.getMean()) { + best = iter.getDBID(); + bestm = mdist; + } + } + if(best != null && !med.sameDBID(best)) { + changed = true; + medoids.set(i, best); + mdists[i] = bestm; + } + } + // Reassign + if(changed) { + assignToNearestCluster(medoids, mdists, clusters, distQ); + } + } + + // Wrap result + Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param means a list of k means + * @param mdist Mean distances + * @param clusters cluster assignment + * @param distQ distance query + * @return true when the object was reassigned + */ + protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? 
extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + boolean changed = false; + + double[] dists = new double[k]; + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + int minIndex = 0; + double mindist = Double.POSITIVE_INFINITY; + for(int i = 0; i < k; i++) { + dists[i] = distQ.distance(iditer, means.get(i)).doubleValue(); + if(dists[i] < mindist) { + minIndex = i; + mindist = dists[i]; + } + } + if(clusters.get(minIndex).add(iditer)) { + changed = true; + mdist[minIndex].put(mindist); + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? + for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { + mdist[minIndex].put(dists[i], -1); + break; + } + } + } + } + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + protected int k; + + protected int maxiter; + + protected KMedoidsInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected KMedoidsEM<V, D> makeInstance() { + return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java new file mode 100644 index 00000000..269e7e9e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java @@ -0,0 +1,45 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; + +/** + * Interface for initializing K-Medoids. In contrast to k-means initializers, + * this initialization will only return members of the original data set. + * + * @author Erich Schubert + * + * @param <V> Object type + */ +public interface KMedoidsInitialization<V> { + /** + * Choose initial means + * + * @param k Parameter k + * @param distanceFunction Distance function + * @return List of chosen means for k-means + */ + public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction); +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java new file mode 100644 index 00000000..30c80084 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -0,0 +1,310 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.MedoidModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + 
+/** + * Provides the k-medoids clustering algorithm, using the + * "Partitioning Around Medoids" approach. + * + * Reference: + * <p> + * Clustering my means of Medoids<br /> + * Kaufman, L. and Rousseeuw, P.J.<br /> + * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + * </p> + * + * @author Erich Schubert + * + * @apiviz.has MedoidModel + * @apiviz.composedOf KMedoidsInitialization + * + * @param <V> vector datatype + * @param <D> distance value type + */ +@Title("Partioning Around Medoids") +@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(KMedoidsPAM.class); + + /** + * Holds the value of {@link AbstractKMeans#K_ID}. + */ + protected int k; + + /** + * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + */ + protected int maxiter; + + /** + * Method to choose initial means. + */ + protected KMedoidsInitialization<V> initializer; + + /** + * Constructor. + * + * @param distanceFunction distance function + * @param k k parameter + * @param maxiter Maxiter parameter + * @param initializer Function to generate the initial means + */ + public KMedoidsPAM(PrimitiveDistanceFunction<? 
super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + super(distanceFunction); + this.k = k; + this.maxiter = maxiter; + this.initializer = initializer; + } + + /** + * Run k-medoids + * + * @param database Database + * @param relation relation to use + * @return result + */ + public Clustering<MedoidModel> run(Database database, Relation<V> relation) { + if(relation.size() <= 0) { + return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + } + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + DBIDs ids = relation.getDBIDs(); + // Choose initial medoids + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + // Setup cluster assignment store + List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + for(int i = 0; i < k; i++) { + clusters.add(DBIDUtil.newHashSet(relation.size() / k)); + } + + WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + // Initial assignment to nearest medoids + // TODO: reuse this information, from the build phase, when possible? 
+ assignToNearestCluster(medoids, ids, second, clusters, distQ); + + // Swap phase + boolean changed = true; + while(changed) { + changed = false; + // Try to swap the medoid with a better cluster member: + double best = 0; + DBID bestid = null; + int bestcluster = -1; + for(int i = 0; i < k; i++) { + DBID med = medoids.get(i); + for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) { + if(med.sameDBID(iter)) { + continue; + } + // double disti = distQ.distance(id, med).doubleValue(); + double cost = 0; + for(int j = 0; j < k; j++) { + for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { + double distcur = distQ.distance(iter2, medoids.get(j)).doubleValue(); + double distnew = distQ.distance(iter2, iter).doubleValue(); + if(j == i) { + // Cases 1 and 2. + double distsec = second.doubleValue(iter2); + if(distcur > distsec) { + // Case 1, other would switch to a third medoid + cost += distsec - distcur; // Always positive! + } + else { // Would remain with the candidate + cost += distnew - distcur; // Could be negative + } + } + else { + // Cases 3-4: objects from other clusters + if (distcur < distnew) { + // Case 3: no change + } else { + // Case 4: would switch to new medoid + cost += distnew - distcur; // Always negative + } + } + } + } + if (cost < best) { + best = cost; + bestid = iter.getDBID(); + bestcluster = i; + } + } + } + if(logger.isDebugging()) { + logger.debug("Best cost: " + best); + } + if(bestid != null) { + changed = true; + medoids.set(bestcluster, bestid); + } + // Reassign + if(changed) { + // TODO: can we save some of these recomputations? 
+ assignToNearestCluster(medoids, ids, second, clusters, distQ); + } + } + + // Wrap result + Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + } + return result; + } + + /** + * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of + * those FeatureVectors, that are nearest to the k<sup>th</sup> mean. + * + * @param means Object centroids + * @param ids Object ids + * @param second Distance to second nearest medoid + * @param clusters cluster assignment + * @param distQ distance query + * @return true when any object was reassigned + */ + protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + boolean changed = false; + + for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + int minIndex = 0; + double mindist = Double.POSITIVE_INFINITY; + double mindist2 = Double.POSITIVE_INFINITY; + for(int i = 0; i < k; i++) { + double dist = distQ.distance(iditer, means.get(i)).doubleValue(); + if(dist < mindist) { + minIndex = i; + mindist2 = mindist; + mindist = dist; + } + else if(dist < mindist2) { + mindist2 = dist; + } + } + if(clusters.get(minIndex).add(iditer)) { + changed = true; + // Remove from previous cluster + // TODO: keep a list of cluster assignments to save this search? 
+ for(int i = 0; i < k; i++) { + if(i != minIndex) { + if(clusters.get(i).remove(iditer)) { + break; + } + } + } + } + second.put(iditer, mindist2); + } + return changed; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + protected int k; + + protected int maxiter; + + protected KMedoidsInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0)); + if(config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + if(config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0); + if(config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + @Override + protected KMedoidsPAM<V, D> makeInstance() { + return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java new file mode 100644 index 00000000..094c37bb --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java @@ -0,0 +1,187 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * PAM initialization for k-means (and of course, PAM). + * + * Reference: + * <p> + * Clustering my means of Medoids<br /> + * Kaufman, L. and Rousseeuw, P.J.<br /> + * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + * </p> + * + * TODO: enforce using a distance matrix? + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") +public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMeansInitialization<V>, KMedoidsInitialization<V> { + /** + * Constructor. + */ + public PAMInitialMeans() { + super(); + } + + @Override + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? 
super V, ?> distanceFunction) { + // Get a distance query + if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; + final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + DBIDs medids = chooseInitialMedoids(k, distQ); + List<V> medoids = new ArrayList<V>(k); + for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) { + medoids.add(relation.get(iter)); + } + return medoids; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { + if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("PAM initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; + final DBIDs ids = distQ.getRelation().getDBIDs(); + + ArrayModifiableDBIDs medids = DBIDUtil.newArray(k); + double best = Double.POSITIVE_INFINITY; + Mean mean = new Mean(); // Mean is numerically more stable than sum. + WritableDoubleDataStore mindist = null; + + // First mean is chosen by having the smallest distance sum to all others. 
+ { + DBID bestid = null; + WritableDoubleDataStore bestd = null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + mean.reset(); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + double d = distQ.distance(iter, iter2).doubleValue(); + mean.put(d); + newd.putDouble(iter2, d); + } + if(mean.getMean() < best) { + best = mean.getMean(); + bestid = iter.getDBID(); + if(bestd != null) { + bestd.destroy(); + } + bestd = newd; + } + else { + newd.destroy(); + } + } + medids.add(bestid); + mindist = bestd; + } + assert (mindist != null); + + // Subsequent means optimize the full criterion. + for(int i = 1; i < k; i++) { + DBID bestid = null; + WritableDoubleDataStore bestd = null; + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); + if(medids.contains(id)) { + continue; + } + WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + mean.reset(); + for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + DBID other = iter2.getDBID(); + double dn = distQ.distance(id, other).doubleValue(); + double v = Math.min(dn, mindist.doubleValue(other)); + mean.put(v); + newd.put(other, v); + } + assert (mean.getCount() == ids.size()); + if(mean.getMean() < best) { + best = mean.getMean(); + bestid = id; + if(bestd != null) { + bestd.destroy(); + } + bestd = newd; + } + else { + newd.destroy(); + } + } + if(bestid == null) { + throw new AbortException("No median found that improves the criterion function?!?"); + } + medids.add(bestid); + mindist.destroy(); + mindist = bestd; + } + + mindist.destroy(); + return medids; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + @Override + protected PAMInitialMeans<V, D> makeInstance() { + return new PAMInitialMeans<V, D>(); + } + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java index 30e59453..5b9da923 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -25,13 +25,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.data.NumberVector; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; /** * Initialize K-means by randomly choosing k exsiting elements as cluster @@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; * * @param <V> Vector type */ -public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> { +public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { /** * Constructor. * @@ -52,15 +51,20 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? 
super V, ?> distanceFunction) { DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed); - List<Vector> means = new ArrayList<Vector>(k); - for(DBID id : ids) { - means.add(relation.get(id).getColumnVector()); + List<V> means = new ArrayList<V>(k); + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + means.add(relation.get(iter)); } return means; } + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) { + return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, seed); + } + /** * Parameterization class. * @@ -68,7 +72,7 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected RandomlyChosenInitialMeans<V> makeInstance() { diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index e8a466dd..00ed08c4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -30,7 +30,6 @@ import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.math.MathUtil; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @@ -53,10 +52,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends } @Override - public List<Vector> chooseInitialMeans(Relation<V> relation, int k, 
PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { final int dim = DatabaseUtil.dimensionality(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<Vector> means = new ArrayList<Vector>(k); + List<V> means = new ArrayList<V>(k); final Random random = (this.seed != null) ? new Random(this.seed) : new Random(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); @@ -64,12 +63,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends for(int d = 0; d < dim; d++) { r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d]; } - means.add(new Vector(r)); + means.add(minmax.first.newNumberVector(r)); } return means; } - /** * Parameterization class. * @@ -78,7 +76,6 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends * @apiviz.exclude */ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { - @Override protected RandomlyGeneratedInitialMeans<V> makeInstance() { return new RandomlyGeneratedInitialMeans<V>(seed); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index e3b274a6..01a693e4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -27,14 +27,12 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import 
de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace; import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEUnit; import de.lmu.ifi.dbs.elki.data.Cluster; @@ -46,6 +44,7 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -97,7 +96,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications") @Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.") @Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314") -public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> { +public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. 
*/ @@ -299,8 +298,8 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus minima[d] = Double.MAX_VALUE; } // update minima and maxima - for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) { - V featureVector = database.get(it.next()); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V featureVector = database.get(it); updateMinMax(featureVector, minima, maxima); } for(int i = 0; i < maxima.length; i++) { @@ -393,13 +392,15 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus // identify dense units double total = database.size(); - for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) { - final DBID id = it.next(); - V featureVector = database.get(id); + for(DBIDIter it = database.iterDBIDs(); it.valid();) { + V featureVector = database.get(it); + final DBID id = it.getDBID(); + it.advance(); for(CLIQUEUnit<V> unit : units) { unit.addFeatureVector(id, featureVector); // unit is a dense unit - if(!it.hasNext() && unit.selectivity(total) >= tau) { + // FIXME: why it.valid()? 
+ if(!it.valid() && unit.selectivity(total) >= tau) { denseUnits.add(unit); // add the dense unit to its subspace int dim = unit.getIntervals().iterator().next().getDimension(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index c4c1687b..df3fe8b5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -34,7 +34,6 @@ import java.util.List; import java.util.Map; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; @@ -100,7 +99,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; @Title("DiSH: Detecting Subspace cluster Hierarchies") @Description("Algorithm to find hierarchical correlation clusters in subspaces.") @Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15") -public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> { +public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ @@ -162,8 +161,11 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste /** * Performs the DiSH algorithm on the given database. 
+ * + * @param database Database to process + * @param relation Relation to process */ - public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { // Instantiate DiSH distance (and thus run the preprocessor) if(logger.isVerbose()) { logger.verbose("*** Run DiSH preprocessor."); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 3f16e907..4eedbecd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -28,7 +28,6 @@ import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; @@ -39,13 +38,13 @@ import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.Subspace; -import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; @@ -87,11 +86,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param <V> the type of NumberVector handled by this Algorithm */ +// TODO: optimize by creating much less objects @Title("PROCLUS: PROjected CLUStering") @Description("Algorithm to find subspace clusters in 
high dimensional spaces.") @Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188") -// TODO: make the generics reflect the SubspaceModel -public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClustering<Clustering<Model>, V> { +public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ @@ -141,8 +140,11 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus /** * Performs the PROCLUS algorithm on the given database. + * + * @param database Database to process + * @param relation Relation to process */ - public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException { + public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); final Random random = new Random(); @@ -193,6 +195,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus IndefiniteProgress cprogress = logger.isVerbose() ? 
new IndefiniteProgress("Current number of clusters:", logger) : null; + // TODO: Use DataStore and Trove for performance Map<DBID, PROCLUSCluster> clusters = null; int loops = 0; while(loops < 10) { @@ -229,9 +232,9 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus // build result int numClusters = 1; - Clustering<Model> result = new Clustering<Model>("ProClus clustering", "proclus-clustering"); + Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("ProClus clustering", "proclus-clustering"); for(PROCLUSCluster c : finalClusters) { - Cluster<Model> cluster = new Cluster<Model>(c.objectIDs); + Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.objectIDs); cluster.setModel(new SubspaceModel<V>(new Subspace<V>(c.getDimensions()), c.centroid)); cluster.setName("cluster_" + numClusters++); @@ -262,7 +265,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus // compute distances between each point in S and m_i Map<DBID, DistanceResultPair<DoubleDistance>> distances = new HashMap<DBID, DistanceResultPair<DoubleDistance>>(); - for(DBID id : s) { + for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); DoubleDistance dist = distFunc.distance(id, m_i); distances.put(id, new GenericDistanceResultPair<DoubleDistance>(dist, id)); } @@ -278,7 +282,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus distances.remove(m_i); // compute distances of each point to closest medoid - for(DBID id : s) { + for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); DoubleDistance dist_new = distFunc.distance(id, m_i); DoubleDistance dist_old = distances.get(id).getDistance(); @@ -323,12 +328,11 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus */ private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) { ArrayModifiableDBIDs 
m_list = DBIDUtil.newArray(m); - for(DBID m_i : m_best) { - m_list.remove(m_i); - } + m_list.removeDBIDs(m_best); ModifiableDBIDs m_current = DBIDUtil.newHashSet(); - for(DBID m_i : m_best) { + for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { + DBID m_i = iter.getDBID(); if(m_bad.contains(m_i)) { int currentSize = m_current.size(); while(m_current.size() == currentSize) { @@ -358,11 +362,13 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus private Map<DBID, List<DistanceResultPair<DoubleDistance>>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { Map<DBID, List<DistanceResultPair<DoubleDistance>>> result = new HashMap<DBID, List<DistanceResultPair<DoubleDistance>>>(); - for(DBID m : medoids) { + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + DBID m = iter.getDBID(); // determine minimum distance between current medoid m and any other // medoid m_i DoubleDistance minDist = null; - for(DBID m_i : medoids) { + for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { + DBID m_i = iter2.getDBID(); if(m_i == m) { continue; } @@ -399,7 +405,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus int dim = DatabaseUtil.dimensionality(database); Map<DBID, double[]> averageDistances = new HashMap<DBID, double[]>(); - for(DBID m_i : medoids) { + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + DBID m_i = iter.getDBID(); V medoid_i = database.get(m_i); List<DistanceResultPair<DoubleDistance>> l_i = localities.get(m_i); double[] x_i = new double[dim]; @@ -417,7 +424,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus Map<DBID, Set<Integer>> dimensionMap = new HashMap<DBID, Set<Integer>>(); List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<CTriple<Double, DBID, Integer>>(); - for(DBID m_i : medoids) { + 
for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + DBID m_i = iter.getDBID(); Set<Integer> dims_i = new HashSet<Integer>(); dimensionMap.put(m_i, dims_i); @@ -478,8 +486,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus for(int i = 0; i < clusters.size(); i++) { PROCLUSCluster c_i = clusters.get(i); double[] x_i = new double[dim]; - for(DBID id : c_i.objectIDs) { - V o = database.get(id); + for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { + V o = database.get(iter); for(int d = 0; d < dim; d++) { x_i[d] += Math.abs(c_i.centroid.doubleValue(d + 1) - o.doubleValue(d + 1)); } @@ -560,8 +568,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus clusterIDs.put(m_i, DBIDUtil.newHashSet()); } - for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) { - DBID p_id = it.next(); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + DBID p_id = it.getDBID(); V p = database.get(p_id); DistanceResultPair<DoubleDistance> minDist = null; for(DBID m_i : dimensions.keySet()) { @@ -610,8 +618,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus clusterIDs.put(i, DBIDUtil.newHashSet()); } - for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) { - DBID p_id = it.next(); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + DBID p_id = it.getDBID(); V p = database.get(p_id); Pair<DoubleDistance, Integer> minDist = null; for(int i = 0; i < dimensions.size(); i++) { @@ -707,8 +715,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus */ private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) { double avg = 0; - for(DBID objectID : objectIDs) { - V o = database.get(objectID); + for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { + V o = database.get(iter); avg += Math.abs(centroid.doubleValue(dimension) - 
o.doubleValue(dimension)); } return avg / objectIDs.size(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index 963c0922..c47c74b6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -30,7 +30,6 @@ import java.util.List; import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; @@ -77,7 +76,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Elke Achtert * * @apiviz.uses DBSCAN - * @apiviz.uses DimensionsSelectingEuclideanDistanceFunction + * @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction * @apiviz.has SubspaceModel * * @param <V> the type of FeatureVector handled by this Algorithm @@ -85,7 +84,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @Title("SUBCLU: Density connected Subspace Clustering") @Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.") @Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. 
on Data Mining (SDM'04), Lake Buena Vista, FL, 2004") -public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> { +public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { /** * The logger for this class. */ @@ -162,7 +161,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus * @param relation Relation to process * @return Clustering result */ - public Clustering<SubspaceModel<V>> run(Relation<V> relation) throws IllegalStateException { + public Clustering<SubspaceModel<V>> run(Relation<V> relation) { final int dimensionality = DatabaseUtil.dimensionality(relation); StepProgress stepprog = logger.isVerbose() ? new StepProgress(dimensionality) : null; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java new file mode 100644 index 00000000..17eb3c19 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java @@ -0,0 +1,39 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; + +/** + * Interface for subspace clustering algorithms that use a model derived from + * {@link SubspaceModel}, that can then be post-processed for outlier detection. + * + * @author Erich Schubert + * + * @param <M> Model type + */ +public interface SubspaceClusteringAlgorithm<M extends SubspaceModel<?>> extends ClusteringAlgorithm<Clustering<M>> { + // No additional constraints +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java index 43c6a218..ee42a59f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java @@ -39,7 +39,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -137,12 +141,12 @@ public class 
ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl * @param relation The data input we use */ public Clustering<Model> run(Relation<?> relation) { - HashMap<String, ModifiableDBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation); + HashMap<String, DBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation); ModifiableDBIDs noiseids = DBIDUtil.newArray(); Clustering<Model> result = new Clustering<Model>("By Label Clustering", "bylabel-clustering"); - for(Entry<String, ModifiableDBIDs> entry : labelMap.entrySet()) { - ModifiableDBIDs ids = entry.getValue(); + for(Entry<String, DBIDs> entry : labelMap.entrySet()) { + DBIDs ids = entry.getValue(); if(ids.size() <= 1) { noiseids.addDBIDs(ids); continue; @@ -170,12 +174,13 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl * @param data the database storing the objects * @return a mapping of labels to ids */ - private HashMap<String, ModifiableDBIDs> singleAssignment(Relation<?> data) { - HashMap<String, ModifiableDBIDs> labelMap = new HashMap<String, ModifiableDBIDs>(); + private HashMap<String, DBIDs> singleAssignment(Relation<?> data) { + HashMap<String, DBIDs> labelMap = new HashMap<String, DBIDs>(); - for(DBID id : data.iterDBIDs()) { - String label = data.get(id).toString(); - assign(labelMap, label, id); + for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { + final Object val = data.get(iditer); + String label = (val != null) ? 
val.toString() : null; + assign(labelMap, label, iditer); } return labelMap; } @@ -187,13 +192,13 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl * @param data the database storing the objects * @return a mapping of labels to ids */ - private HashMap<String, ModifiableDBIDs> multipleAssignment(Relation<?> data) { - HashMap<String, ModifiableDBIDs> labelMap = new HashMap<String, ModifiableDBIDs>(); + private HashMap<String, DBIDs> multipleAssignment(Relation<?> data) { + HashMap<String, DBIDs> labelMap = new HashMap<String, DBIDs>(); - for(DBID id : data.iterDBIDs()) { - String[] labels = data.get(id).toString().split(" "); + for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { + String[] labels = data.get(iditer).toString().split(" "); for(String label : labels) { - assign(labelMap, label, id); + assign(labelMap, label, iditer); } } return labelMap; @@ -206,14 +211,22 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl * @param label the label of the object to be assigned * @param id the id of the object to be assigned */ - private void assign(HashMap<String, ModifiableDBIDs> labelMap, String label, DBID id) { + private void assign(HashMap<String, DBIDs> labelMap, String label, DBIDRef id) { if(labelMap.containsKey(label)) { - labelMap.get(label).add(id); + DBIDs exist = labelMap.get(label); + if (exist instanceof DBID) { + ModifiableDBIDs n = DBIDUtil.newHashSet(); + n.add((DBID)exist); + n.add(id); + labelMap.put(label, n); + } else { + assert(exist instanceof HashSetModifiableDBIDs); + assert (exist.size() > 1); + ((ModifiableDBIDs)exist).add(id); + } } else { - ModifiableDBIDs n = DBIDUtil.newHashSet(); - n.add(id); - labelMap.put(label, n); + labelMap.put(label, id.getDBID()); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java 
index 228cc7e7..26bf525a 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java @@ -39,7 +39,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -96,28 +100,26 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< * * @param relation The data input to use */ - public Clustering<Model> run(Relation<?> relation) throws IllegalStateException { - HashMap<String, ModifiableDBIDs> labelmap = new HashMap<String, ModifiableDBIDs>(); + public Clustering<Model> run(Relation<?> relation) { + HashMap<String, DBIDs> labelmap = new HashMap<String, DBIDs>(); ModifiableDBIDs noiseids = DBIDUtil.newArray(); - for(DBID id : relation.iterDBIDs()) { - String label = relation.get(id).toString(); - - if(labelmap.containsKey(label)) { - labelmap.get(label).add(id); - } - else { - ModifiableDBIDs n = DBIDUtil.newHashSet(); - n.add(id); - labelmap.put(label, n); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final Object val = relation.get(iditer); + if(val == null) { + noiseids.add(iditer); + continue; } + String label = val.toString(); + + assign(labelmap, label, iditer); } ArrayList<Cluster<Model>> clusters = new ArrayList<Cluster<Model>>(labelmap.size()); - for(Entry<String, ModifiableDBIDs> entry : labelmap.entrySet()) { - 
ModifiableDBIDs ids = entry.getValue(); - if(ids.size() <= 1) { - noiseids.addDBIDs(ids); + for(Entry<String, DBIDs> entry : labelmap.entrySet()) { + DBIDs ids = entry.getValue(); + if(ids instanceof DBID) { + noiseids.add((DBID) ids); continue; } Cluster<Model> clus = new Cluster<Model>(entry.getKey(), ids, ClusterModel.CLUSTER, new ArrayList<Cluster<Model>>(), new ArrayList<Cluster<Model>>()); @@ -153,6 +155,33 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< return new Clustering<Model>("By Label Hierarchical Clustering", "bylabel-clustering", rootclusters); } + /** + * Assigns the specified id to the labelMap according to its label + * + * @param labelMap the mapping of label to ids + * @param label the label of the object to be assigned + * @param id the id of the object to be assigned + */ + private void assign(HashMap<String, DBIDs> labelMap, String label, DBIDRef id) { + if(labelMap.containsKey(label)) { + DBIDs exist = labelMap.get(label); + if(exist instanceof DBID) { + ModifiableDBIDs n = DBIDUtil.newHashSet(); + n.add((DBID) exist); + n.add(id); + labelMap.put(label, n); + } + else { + assert (exist instanceof HashSetModifiableDBIDs); + assert (exist.size() > 1); + ((ModifiableDBIDs) exist).add(id); + } + } + else { + labelMap.put(label, id.getDBID()); + } + } + @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.GUESSED_LABEL); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java new file mode 100644 index 00000000..f082db9c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java @@ -0,0 +1,74 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; + +import de.lmu.ifi.dbs.elki.data.ClassLabel; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import 
de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Trivial class that will try to cluster by label, and fall back to an + * "all-in-one" clustering. + * + * @author Erich Schubert + */ +public class ByLabelOrAllInOneClustering extends ByLabelClustering { + /** + * Constructor. + */ + public ByLabelOrAllInOneClustering() { + super(); + } + + @Override + public Clustering<Model> run(Database database) { + // Prefer a true class label + try { + Relation<ClassLabel> relation = database.getRelation(TypeUtil.CLASSLABEL); + return run(relation); + } + catch(NoSupportedDataTypeException e) { + // Ignore. 
+ } + try { + Relation<ClassLabel> relation = database.getRelation(TypeUtil.GUESSED_LABEL); + return run(relation); + } + catch(NoSupportedDataTypeException e) { + // Ignore. + } + final DBIDs ids = database.getRelation(TypeUtil.ANY).getDBIDs(); + Clustering<Model> result = new Clustering<Model>("All-in-one trivial Clustering", "allinone-clustering"); + Cluster<Model> c = new Cluster<Model>(ids, ClusterModel.CLUSTER); + result.addCluster(c); + return result; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java index cd45cda2..90ca3625 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java @@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -102,14 +102,14 @@ public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> impl public Clustering<Model> run(Relation<Model> relation) { // Build model mapping HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<Model, ModifiableDBIDs>(); - for(DBID id : relation.iterDBIDs()) { - Model model = relation.get(id); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + Model model = relation.get(iditer); ModifiableDBIDs modelids = modelMap.get(model); if(modelids == null) { modelids = DBIDUtil.newHashSet(); modelMap.put(model, modelids); } - modelids.add(id); + modelids.add(iditer); } Clustering<Model> result = 
new Clustering<Model>("By Model Clustering", "bymodel-clustering"); |