summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java3
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java56
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java41
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java66
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java19
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java37
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java40
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java24
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java30
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java80
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java268
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java323
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java178
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java94
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java43
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java168
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java32
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java55
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java10
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java84
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java172
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java271
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java45
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java310
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java187
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java22
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java9
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java19
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java58
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java7
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java39
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java49
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java61
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java74
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java8
46 files changed, 2664 insertions, 402 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
index ea441655..670a3f0f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
@@ -27,7 +27,6 @@ import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.PROCLUS;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
@@ -49,7 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
* @param <R> the result we return
* @param <V> the type of FeatureVector handled by this Algorithm
*/
-public abstract class AbstractProjectedClustering<R extends Clustering<Model>, V extends NumberVector<V, ?>> extends AbstractAlgorithm<R> implements ClusteringAlgorithm<R> {
+public abstract class AbstractProjectedClustering<R extends Clustering<?>, V extends NumberVector<V, ?>> extends AbstractAlgorithm<R> implements ClusteringAlgorithm<R> {
/**
* Parameter to specify the number of clusters to find, must be an integer
* greater than 0.
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
index 108ba0ed..250cc70b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
@@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
@@ -166,7 +167,14 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
this.lambda = lambda;
}
- public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException {
+ /**
+ * Run the algorithm
+ *
+ * @param database Database to process
+ * @param relation Relation to process
+ * @return Clustering result
+ */
+ public Clustering<Model> run(Database database, Relation<V> relation) {
FiniteProgress objprog = getLogger().isVerbose() ? new FiniteProgress("Processing objects", relation.size(), getLogger()) : null;
IndefiniteProgress clusprog = getLogger().isVerbose() ? new IndefiniteProgress("Number of clusters", getLogger()) : null;
resultList = new ArrayList<ModifiableDBIDs>();
@@ -177,9 +185,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
if(relation.size() >= minpts) {
- for(DBID id : relation.iterDBIDs()) {
- if(!processedIDs.contains(id)) {
- expandCluster(distFunc, rangeQuery, id, objprog, clusprog);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(!processedIDs.contains(iditer)) {
+ expandCluster(distFunc, rangeQuery, iditer.getDBID(), objprog, clusprog);
if(processedIDs.size() == relation.size() && noise.size() == 0) {
break;
}
@@ -191,8 +199,8 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
}
}
else {
- for(DBID id : relation.iterDBIDs()) {
- noise.add(id);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ noise.add(iditer);
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
@@ -284,28 +292,26 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
// try to expand the cluster
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
for(DistanceResultPair<DoubleDistance> seed : seeds) {
- DBID nextID = seed.getDBID();
-
- Integer nextID_corrDim = distFunc.getIndex().getLocalProjection(nextID).getCorrelationDimension();
+ int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension();
// nextID is not reachable from start object
if(nextID_corrDim > lambda) {
continue;
}
- if(!processedIDs.contains(nextID)) {
- currentCluster.add(nextID);
- processedIDs.add(nextID);
+ if(!processedIDs.contains(seed)) {
+ currentCluster.add(seed);
+ processedIDs.add(seed);
}
- else if(noise.contains(nextID)) {
- currentCluster.add(nextID);
- noise.remove(nextID);
+ else if(noise.contains(seed)) {
+ currentCluster.add(seed);
+ noise.remove(seed);
}
}
seeds.remove(0);
while(seeds.size() > 0) {
- DBID q = seeds.remove(0).getDBID();
- Integer corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension();
+ DistanceResultPair<DoubleDistance> q = seeds.remove(0);
+ int corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension();
// q forms no lambda-dim hyperplane
if(corrDim_q > lambda) {
continue;
@@ -314,22 +320,22 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
List<DistanceResultPair<DoubleDistance>> reachables = rangeQuery.getRangeForDBID(q, epsilon);
if(reachables.size() > minpts) {
for(DistanceResultPair<DoubleDistance> r : reachables) {
- Integer corrDim_r = distFunc.getIndex().getLocalProjection(r.getDBID()).getCorrelationDimension();
+ int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension();
// r is not reachable from q
if(corrDim_r > lambda) {
continue;
}
- boolean inNoise = noise.contains(r.getDBID());
- boolean unclassified = !processedIDs.contains(r.getDBID());
+ boolean inNoise = noise.contains(r);
+ boolean unclassified = !processedIDs.contains(r);
if(inNoise || unclassified) {
if(unclassified) {
seeds.add(r);
}
- currentCluster.add(r.getDBID());
- processedIDs.add(r.getDBID());
+ currentCluster.add(r);
+ processedIDs.add(r);
if(inNoise) {
- noise.remove(r.getDBID());
+ noise.remove(r);
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
@@ -349,9 +355,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
resultList.add(currentCluster);
}
else {
- for(DBID id : currentCluster) {
- noise.add(id);
- }
+ noise.addDBIDs(currentCluster);
noise.add(startObjectID);
processedIDs.add(startObjectID);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java
index 5ec59777..8f637460 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java
@@ -47,5 +47,5 @@ import de.lmu.ifi.dbs.elki.database.Database;
*/
public interface ClusteringAlgorithm<C extends Clustering<? extends Model>> extends Algorithm {
@Override
- C run(Database database) throws IllegalStateException;
+ C run(Database database);
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
index b59af555..6bafa9e9 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
@@ -141,9 +142,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
noise = DBIDUtil.newHashSet();
processedIDs = DBIDUtil.newHashSet(size);
if(size >= minpts) {
- for(DBID id : relation.iterDBIDs()) {
- if(!processedIDs.contains(id)) {
- expandCluster(relation, rangeQuery, id, objprog, clusprog);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(!processedIDs.contains(iditer)) {
+ expandCluster(relation, rangeQuery, iditer.getDBID(), objprog, clusprog);
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), logger);
@@ -155,8 +156,8 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
}
else {
- for(DBID id : relation.iterDBIDs()) {
- noise.add(id);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ noise.add(iditer);
if(objprog != null && clusprog != null) {
objprog.setProcessed(noise.size(), logger);
clusprog.setProcessed(resultList.size(), logger);
@@ -210,35 +211,33 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
// try to expand the cluster
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
for(DistanceResultPair<D> seed : seeds) {
- DBID nextID = seed.getDBID();
- if(!processedIDs.contains(nextID)) {
- currentCluster.add(nextID);
- processedIDs.add(nextID);
+ if(!processedIDs.contains(seed)) {
+ currentCluster.add(seed);
+ processedIDs.add(seed);
}
- else if(noise.contains(nextID)) {
- currentCluster.add(nextID);
- noise.remove(nextID);
+ else if(noise.contains(seed)) {
+ currentCluster.add(seed);
+ noise.remove(seed);
}
}
seeds.remove(0);
while(seeds.size() > 0) {
- DBID o = seeds.remove(0).getDBID();
+ DistanceResultPair<D> o = seeds.remove(0);
List<DistanceResultPair<D>> neighborhood = rangeQuery.getRangeForDBID(o, epsilon);
if(neighborhood.size() >= minpts) {
for(DistanceResultPair<D> neighbor : neighborhood) {
- DBID p = neighbor.getDBID();
- boolean inNoise = noise.contains(p);
- boolean unclassified = !processedIDs.contains(p);
+ boolean inNoise = noise.contains(neighbor);
+ boolean unclassified = !processedIDs.contains(neighbor);
if(inNoise || unclassified) {
if(unclassified) {
seeds.add(neighbor);
}
- currentCluster.add(p);
- processedIDs.add(p);
+ currentCluster.add(neighbor);
+ processedIDs.add(neighbor);
if(inNoise) {
- noise.remove(p);
+ noise.remove(neighbor);
}
}
}
@@ -258,9 +257,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
resultList.add(currentCluster);
}
else {
- for(DBID id : currentCluster) {
- noise.add(id);
- }
+ noise.addDBIDs(currentCluster);
noise.add(startObjectID);
processedIDs.add(startObjectID);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
index f1e6c945..a0780e3d 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
*/
import java.util.Collection;
-import java.util.Iterator;
import java.util.List;
import java.util.Set;
@@ -36,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.DistanceUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
@@ -201,11 +201,11 @@ public class DeLiClu<NV extends NumberVector<NV, ?>, D extends Distance<D>> exte
* @return the id of the start object for the run method
*/
private DBID getStartObject(Relation<NV> relation) {
- Iterator<DBID> it = relation.iterDBIDs();
- if(!it.hasNext()) {
+ DBIDIter it = relation.iterDBIDs();
+ if(!it.valid()) {
return null;
}
- return it.next();
+ return it.getDBID();
}
/**
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
index a70a3f6f..63ebbabb 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
@@ -39,7 +39,8 @@ import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -170,28 +171,31 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
if(logger.isVerbose()) {
logger.verbose("initializing " + k + " models");
}
- List<Vector> means = initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC);
+ List<Vector> means = new ArrayList<Vector>();
+ for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC)) {
+ means.add(nv.getColumnVector());
+ }
List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k);
- List<Double> normDistrFactor = new ArrayList<Double>(k);
+ double[] normDistrFactor = new double[k];
List<Matrix> invCovMatr = new ArrayList<Matrix>(k);
- List<Double> clusterWeights = new ArrayList<Double>(k);
+ double[] clusterWeights = new double[k];
probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
final int dimensionality = means.get(0).getDimensionality();
for(int i = 0; i < k; i++) {
Matrix m = Matrix.identity(dimensionality, dimensionality);
covarianceMatrices.add(m);
- normDistrFactor.add(1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det()));
+ normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det());
invCovMatr.add(m.inverse());
- clusterWeights.add(1.0 / k);
+ clusterWeights[i] = 1.0 / k;
if(logger.isDebuggingFinest()) {
StringBuffer msg = new StringBuffer();
msg.append(" model ").append(i).append(":\n");
msg.append(" mean: ").append(means.get(i)).append("\n");
msg.append(" m:\n").append(FormatUtil.format(m, " ")).append("\n");
msg.append(" m.det(): ").append(m.det()).append("\n");
- msg.append(" cluster weight: ").append(clusterWeights.get(i)).append("\n");
- msg.append(" normDistFact: ").append(normDistrFactor.get(i)).append("\n");
+ msg.append(" cluster weight: ").append(clusterWeights[i]).append("\n");
+ msg.append(" normDistFact: ").append(normDistrFactor[i]).append("\n");
logger.debugFine(msg.toString());
}
}
@@ -216,31 +220,31 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
double[] sumOfClusterProbabilities = new double[k];
for(int i = 0; i < k; i++) {
- clusterWeights.set(i, 0.0);
+ clusterWeights[i] = 0.0;
meanSums.add(new Vector(dimensionality));
covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality));
}
// weights and means
- for(DBID id : relation.iterDBIDs()) {
- double[] clusterProbabilities = probClusterIGivenX.get(id);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
for(int i = 0; i < k; i++) {
sumOfClusterProbabilities[i] += clusterProbabilities[i];
- Vector summand = relation.get(id).getColumnVector().timesEquals(clusterProbabilities[i]);
+ Vector summand = relation.get(iditer).getColumnVector().timesEquals(clusterProbabilities[i]);
meanSums.get(i).plusEquals(summand);
}
}
final int n = relation.size();
for(int i = 0; i < k; i++) {
- clusterWeights.set(i, sumOfClusterProbabilities[i] / n);
+ clusterWeights[i] = sumOfClusterProbabilities[i] / n;
Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]);
means.set(i, newMean);
}
// covariance matrices
- for(DBID id : relation.iterDBIDs()) {
- double[] clusterProbabilities = probClusterIGivenX.get(id);
- Vector instance = relation.get(id).getColumnVector();
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
+ Vector instance = relation.get(iditer).getColumnVector();
for(int i = 0; i < k; i++) {
Vector difference = instance.minus(means.get(i));
covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i]));
@@ -250,7 +254,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT));
}
for(int i = 0; i < k; i++) {
- normDistrFactor.set(i, 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * covarianceMatrices.get(i).det()));
+ normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * covarianceMatrices.get(i).det());
invCovMatr.set(i, covarianceMatrices.get(i).inverse());
}
// reassign probabilities
@@ -269,8 +273,8 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
}
// provide a hard clustering
- for(DBID id : relation.iterDBIDs()) {
- double[] clusterProbabilities = probClusterIGivenX.get(id);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
int maxIndex = 0;
double currentMax = 0.0;
for(int i = 0; i < k; i++) {
@@ -279,7 +283,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
currentMax = clusterProbabilities[i];
}
}
- hardClusters.get(maxIndex).add(id);
+ hardClusters.get(maxIndex).add(iditer);
}
final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering");
@@ -309,25 +313,25 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
* @param clusterWeights the weights of the current clusters
* @return the expectation value of the current mixture of distributions
*/
- protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
+ protected double assignProbabilitiesToInstances(Relation<V> database, double[] normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
double emSum = 0.0;
- for(DBID id : database.iterDBIDs()) {
- Vector x = database.get(id).getColumnVector();
- List<Double> probabilities = new ArrayList<Double>(k);
+ for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ Vector x = database.get(iditer).getColumnVector();
+ double[] probabilities = new double[k];
for(int i = 0; i < k; i++) {
Vector difference = x.minus(means.get(i));
double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference);
double power = rowTimesCovTimesCol / 2.0;
- double prob = normDistrFactor.get(i) * Math.exp(-power);
+ double prob = normDistrFactor[i] * Math.exp(-power);
if(logger.isDebuggingFinest()) {
logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " "));
}
- probabilities.add(prob);
+ probabilities[i] = prob;
}
double priorProbability = 0.0;
for(int i = 0; i < k; i++) {
- priorProbability += probabilities.get(i) * clusterWeights.get(i);
+ priorProbability += probabilities[i] * clusterWeights[i];
}
double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD);
if(!Double.isNaN(logP)) {
@@ -337,16 +341,16 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
double[] clusterProbabilities = new double[k];
for(int i = 0; i < k; i++) {
assert (priorProbability >= 0.0);
- assert (clusterWeights.get(i) >= 0.0);
+ assert (clusterWeights[i] >= 0.0);
// do not divide by zero!
if(priorProbability == 0.0) {
clusterProbabilities[i] = 0.0;
}
else {
- clusterProbabilities[i] = probabilities.get(i) / priorProbability * clusterWeights.get(i);
+ clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i];
}
}
- probClusterIGivenX.put(id, clusterProbabilities);
+ probClusterIGivenX.put(iditer, clusterProbabilities);
}
return emSum;
@@ -358,7 +362,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
* @param index Point ID
* @return Probabilities of given point
*/
- public double[] getProbClusterIGivenX(DBID index) {
+ public double[] getProbClusterIGivenX(DBIDRef index) {
return probClusterIGivenX.get(index);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
index 2244b07b..04b57081 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
@@ -31,6 +31,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
@@ -142,22 +143,22 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && DoubleDistance.class.isInstance(epsilon)) {
// Optimized codepath for double-based distances. Avoids Java
// boxing/unboxing.
- for(DBID id : relation.iterDBIDs()) {
- if(!processedIDs.contains(id)) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(!processedIDs.contains(iditer)) {
// We need to do some ugly casts to be able to run the optimized version, unfortunately.
@SuppressWarnings("unchecked")
final ClusterOrderResult<DoubleDistance> doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder);
@SuppressWarnings("unchecked")
final RangeQuery<O, DoubleDistance> doubleRangeQuery = RangeQuery.class.cast(rangeQuery);
final DoubleDistance depsilon = DoubleDistance.class.cast(epsilon);
- expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, id, depsilon, progress);
+ expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, iditer.getDBID(), depsilon, progress);
}
}
}
else {
- for(DBID id : relation.iterDBIDs()) {
- if(!processedIDs.contains(id)) {
- expandClusterOrder(clusterOrder, database, rangeQuery, id, epsilon, progress);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(!processedIDs.contains(iditer)) {
+ expandClusterOrder(clusterOrder, database, rangeQuery, iditer.getDBID(), epsilon, progress);
}
}
}
@@ -194,7 +195,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
D coreDistance = last.getDistance();
for(DistanceResultPair<D> neighbor : neighbors) {
- if(processedIDs.contains(neighbor.getDBID())) {
+ if(processedIDs.contains(neighbor)) {
continue;
}
D reachability = DistanceUtil.max(neighbor.getDistance(), coreDistance);
@@ -234,7 +235,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
double coreDistance = ((DoubleDistanceResultPair) last).getDoubleDistance();
for(DistanceResultPair<DoubleDistance> neighbor : neighbors) {
- if(processedIDs.contains(neighbor.getDBID())) {
+ if(processedIDs.contains(neighbor)) {
continue;
}
double reachability = Math.max(((DoubleDistanceResultPair) neighbor).getDoubleDistance(), coreDistance);
@@ -247,7 +248,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
double coreDistance = last.getDistance().doubleValue();
for(DistanceResultPair<DoubleDistance> neighbor : neighbors) {
- if(processedIDs.contains(neighbor.getDBID())) {
+ if(processedIDs.contains(neighbor)) {
continue;
}
double reachability = Math.max(neighbor.getDistance().doubleValue(), coreDistance);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java
index d6c5872a..3ead6f3e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java
@@ -39,7 +39,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult;
*/
public interface OPTICSTypeAlgorithm<D extends Distance<D>> extends Algorithm {
@Override
- ClusterOrderResult<D> run(Database database) throws IllegalStateException;
+ ClusterOrderResult<D> run(Database database);
/**
* Get the minpts value used. Needed for OPTICS Xi etc.
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java
index 45b12c43..2aa38bdd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java
@@ -27,7 +27,6 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -49,6 +48,8 @@ import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -144,7 +145,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size());
// apply the algorithm
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ DBID id = iditer.getDBID();
step1(id);
step2(id, processedIDs, distQuery, m);
step3(id, processedIDs, m);
@@ -200,7 +202,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
* @param distFunc Distance function to use
*/
private void step2(DBID newID, DBIDs processedIDs, DistanceQuery<O, D> distFunc, WritableDataStore<D> m) {
- for(DBID id : processedIDs) {
+ for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) {
+ DBID id = it.getDBID();
// M(i) = dist(i, n+1)
m.put(id, distFunc.distance(id, newID));
}
@@ -215,7 +218,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
*/
private void step3(DBID newID, DBIDs processedIDs, WritableDataStore<D> m) {
// for i = 1..n
- for(DBID id : processedIDs) {
+ for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) {
+ DBID id = it.getDBID();
D l_i = lambda.get(id);
D m_i = m.get(id);
DBID p_i = pi.get(id);
@@ -247,7 +251,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
*/
private void step4(DBID newID, DBIDs processedIDs) {
// for i = 1..n
- for(DBID id : processedIDs) {
+ for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) {
+ DBID id = it.getDBID();
D l_i = lambda.get(id);
D lp_i = lambda.get(pi.get(id));
@@ -303,7 +308,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
// extract the child clusters
Map<DBID, ModifiableDBIDs> cluster_ids = new HashMap<DBID, ModifiableDBIDs>();
Map<DBID, D> cluster_distances = new HashMap<DBID, D>();
- for(DBID id : ids) {
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ DBID id = it.getDBID();
DBID lastObjectInCluster = lastObjectInCluster(id, stopdist, pi, lambda);
ModifiableDBIDs cluster = cluster_ids.get(lastObjectInCluster);
if(cluster == null) {
@@ -387,7 +393,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
}
// right child
DBID rightID = pi.get(leftID);
- if(leftID.equals(rightID)) {
+ if(leftID.sameDBID(rightID)) {
break;
}
Cluster<DendrogramModel<D>> right = nodes.get(rightID);
@@ -472,11 +478,12 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), logger) : null;
- for(DBID cur : order) {
- DBID dest = pi.get(cur);
- D l = lambda.get(cur);
+ for(DBIDIter it = order.iter(); it.valid(); it.advance()) {
+ DBID dest = pi.get(it);
+ D l = lambda.get(it);
// logger.debugFine("DBID " + cur.toString() + " dist: " + l.toString());
if(stopdist != null && stopdist.compareTo(l) > 0) {
+ DBID cur = it.getDBID();
ModifiableDBIDs curset = cids.remove(cur);
ModifiableDBIDs destset = cids.get(dest);
if(destset == null) {
@@ -511,13 +518,11 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
Cluster<Model> cluster = new Cluster<Model>(cname, clusids, ClusterModel.CLUSTER, hier);
// Collect child clusters and clean up the cluster ids, keeping only
// "new" objects.
- Iterator<DBID> iter = clusids.iterator();
- while(iter.hasNext()) {
- DBID child = iter.next();
- Cluster<Model> chiclus = clusters.get(child);
+ for(DBIDMIter iter = clusids.iter(); iter.valid(); iter.advance()) {
+ Cluster<Model> chiclus = clusters.get(iter);
if(chiclus != null) {
hier.add(cluster, chiclus);
- clusters.remove(child);
+ clusters.remove(iter);
iter.remove();
}
}
@@ -545,7 +550,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
cids.put(dest, destset);
destset.add(dest);
}
- destset.add(cur);
+ destset.add(it);
}
}
// Decrement counter
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
index 7c3a13c9..ae612b2a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
@@ -37,6 +37,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
@@ -154,9 +155,9 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
noise = DBIDUtil.newHashSet();
processedIDs = DBIDUtil.newHashSet(relation.size());
if(relation.size() >= minpts) {
- for(DBID id : snnInstance.getRelation().iterDBIDs()) {
+ for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) {
if(!processedIDs.contains(id)) {
- expandCluster(snnInstance, id, objprog, clusprog);
+ expandCluster(snnInstance, id.getDBID(), objprog, clusprog);
if(processedIDs.size() == relation.size() && noise.size() == 0) {
break;
}
@@ -168,7 +169,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
}
}
else {
- for(DBID id : snnInstance.getRelation().iterDBIDs()) {
+ for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) {
noise.add(id);
if(objprog != null && clusprog != null) {
objprog.setProcessed(noise.size(), logger);
@@ -202,9 +203,9 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
*/
protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) {
ArrayModifiableDBIDs neighbors = DBIDUtil.newArray();
- for(DBID id : snnInstance.getRelation().iterDBIDs()) {
- if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) {
- neighbors.add(id);
+ for(DBIDIter iditer = snnInstance.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(snnInstance.similarity(queryObject, iditer).compareTo(epsilon) >= 0) {
+ neighbors.add(iditer);
}
}
return neighbors;
@@ -237,7 +238,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
// try to expand the cluster
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
- for(DBID seed : seeds) {
+ for(DBIDIter seed = seeds.iter(); seed.valid(); seed.advance()) {
if(!processedIDs.contains(seed)) {
currentCluster.add(seed);
processedIDs.add(seed);
@@ -253,7 +254,8 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o);
if(neighborhood.size() >= minpts) {
- for(DBID p : neighborhood) {
+ for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
+ DBID p = iter.getDBID();
boolean inNoise = noise.contains(p);
boolean unclassified = !processedIDs.contains(p);
if(inNoise || unclassified) {
@@ -283,9 +285,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
resultList.add(currentCluster);
}
else {
- for(DBID id : currentCluster) {
- noise.add(id);
- }
+ noise.addDBIDs(currentCluster);
noise.add(startObjectID);
processedIDs.add(startObjectID);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
index b877415e..e4c6a123 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
@@ -48,7 +48,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -86,9 +86,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
* Provides the CASH algorithm, an subspace clustering algorithm based on the
* Hough transform.
*
- * <b>Note:</b> CASH requires explicitly setting the input parser other than default to
- * {@link de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser}:
- * (in the MiniGui, set option: dbc.parser ParameterizationFunctionLabelParser).
+ * <b>Note:</b> CASH requires explicitly setting the input vector type to
+ * {@link ParameterizationFunction}:
+ * (in the MiniGui, set option: parser.vector-type ParameterizationFunction).
*
* <p>
* Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek: Robust
@@ -503,9 +503,9 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
proxy.addRelation(prep);
// Project
- for(DBID id : ids) {
- ParameterizationFunction f = project(basis, relation.get(id));
- prep.set(id, f);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ ParameterizationFunction f = project(basis, relation.get(iter));
+ prep.set(iter, f);
}
if(logger.isDebugging()) {
@@ -662,8 +662,8 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
double d_min = Double.POSITIVE_INFINITY;
double d_max = Double.NEGATIVE_INFINITY;
- for(DBID id : relation.iterDBIDs()) {
- ParameterizationFunction f = relation.get(id);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ ParameterizationFunction f = relation.get(iditer);
HyperBoundingBox minMax = f.determineAlphaMinMax(box);
double f_min = f.function(SpatialUtil.getMin(minMax));
double f_max = f.function(SpatialUtil.getMax(minMax));
@@ -709,11 +709,11 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
ids.addDBIDs(interval.getIDs());
// Search for nearby vectors in original database
- for(DBID id : relation.iterDBIDs()) {
- DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef());
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ DoubleVector v = new DoubleVector(relation.get(iditer).getColumnVector().getArrayRef());
DoubleDistance d = df.distance(v, centroid);
if(d.compareTo(eps) < 0) {
- ids.add(id);
+ ids.add(iditer);
}
}
@@ -735,15 +735,15 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, CASHInterval interval) throws UnableToComplyException {
DBIDs ids = interval.getIDs();
ProxyDatabase proxy = new ProxyDatabase(ids);
- int dim = relation.get(ids.iterator().next()).getDimensionality();
+ int dim = DatabaseUtil.dimensionality(relation);
SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dim, new DoubleVector(new double[dim]));
MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids);
proxy.addRelation(prep);
// Project
- for(DBID id : ids) {
- DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef());
- prep.set(id, v);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef());
+ prep.set(iter, v);
}
if(logger.isDebugging()) {
@@ -800,15 +800,15 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
*/
private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, DBIDs ids) throws UnableToComplyException {
ProxyDatabase proxy = new ProxyDatabase(ids);
- int dim = relation.get(ids.iterator().next()).getDimensionality();
+ int dim = DatabaseUtil.dimensionality(relation);
SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dim, new DoubleVector(new double[dim]));
MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids);
proxy.addRelation(prep);
// Project
- for(DBID id : ids) {
- DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef());
- prep.set(id, v);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef());
+ prep.set(iter, v);
}
return proxy;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
index 575bf117..1d41d37e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
@@ -41,6 +41,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -176,7 +177,7 @@ public class COPAC<V extends NumberVector<V, ?>, D extends Distance<D>> extends
* @return Clustering result
*/
@SuppressWarnings("unchecked")
- public Clustering<Model> run(Relation<V> relation) throws IllegalStateException {
+ public Clustering<Model> run(Relation<V> relation) {
if(logger.isVerbose()) {
logger.verbose("Running COPAC on db size = " + relation.size() + " with dimensionality = " + DatabaseUtil.dimensionality(relation));
}
@@ -189,14 +190,14 @@ public class COPAC<V extends NumberVector<V, ?>, D extends Distance<D>> extends
FiniteProgress partitionProgress = logger.isVerbose() ? new FiniteProgress("Partitioning", relation.size(), logger) : null;
int processed = 1;
- for(DBID id : relation.iterDBIDs()) {
- Integer corrdim = preprocin.getLocalProjection(id).getCorrelationDimension();
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ int corrdim = preprocin.getLocalProjection(iditer).getCorrelationDimension();
if(!partitionMap.containsKey(corrdim)) {
partitionMap.put(corrdim, DBIDUtil.newArray());
}
- partitionMap.get(corrdim).add(id);
+ partitionMap.get(corrdim).add(iditer);
if(partitionProgress != null) {
partitionProgress.setProcessed(processed++, logger);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
index af4f677f..b57a6e29 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
@@ -118,7 +118,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste
* @param relation Relation to process
* @return Clustering result
*/
- public Clustering<CorrelationModel<V>> run(Relation<V> relation) throws IllegalStateException {
+ public Clustering<CorrelationModel<V>> run(Relation<V> relation) {
final int dimensionality = DatabaseUtil.dimensionality(relation);
StepProgress stepprog = logger.isVerbose() ? new StepProgress(3) : null;
@@ -291,7 +291,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste
return parameters;
}
- private void buildHierarchy(SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) throws IllegalStateException {
+ private void buildHierarchy(SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) {
StringBuffer msg = new StringBuffer();
DBSCAN<V, DoubleDistance> dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query));
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
index 41ee1f69..b8942de8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
@@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -175,9 +175,9 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
break;
}
ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
- for(DBID id : current) {
- if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
- subset.add(id);
+ for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
+ if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
+ subset.add(iter);
}
}
// logger.verbose("size:"+subset.size());
@@ -265,16 +265,16 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
int remaining_retries = 100;
for(int i = 1; i <= samples; i++) {
DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong());
- final Iterator<DBID> iter = sample.iterator();
+ final DBIDIter iter = sample.iter();
// Use first as origin
- DBID origin = iter.next();
- Vector originV = relation.get(origin).getColumnVector();
+ Vector originV = relation.get(iter).getColumnVector();
+ iter.advance();
// Build orthogonal basis from remainder
Matrix basis;
{
List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1);
- while(iter.hasNext()) {
- Vector vec = relation.get(iter.next()).getColumnVector();
+ for(;iter.valid(); iter.advance()) {
+ Vector vec = relation.get(iter).getColumnVector();
vectors.add(vec.minusEquals(originV));
}
// generate orthogonal basis
@@ -292,12 +292,12 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
// Generate and fill a histogram.
FlexiHistogram<Double, Double> histogram = FlexiHistogram.DoubleSumHistogram(BINS);
double w = 1.0 / currentids.size();
- for(DBID point : currentids) {
+ for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) {
// Skip sampled points
- if(sample.contains(point)) {
+ if(sample.contains(iter2)) {
continue;
}
- Vector vec = relation.get(point).getColumnVector().minusEquals(originV);
+ Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV);
final double distance = deviation(vec, basis);
histogram.aggregate(distance, w);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
index eb5608fc..2e9f4a9b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
@@ -38,6 +38,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -139,8 +140,11 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
/**
* Performs the ORCLUS algorithm on the given database.
+ *
+ * @param database Database
+ * @param relation Relation
*/
- public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<Model> run(Database database, Relation<V> relation) {
try {
DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database);
// current dimensionality associated with each seed
@@ -211,8 +215,8 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, seed);
V factory = DatabaseUtil.assumeVectorField(database).getFactory();
List<ORCLUSCluster> seeds = new ArrayList<ORCLUSCluster>();
- for(DBID id : randomSample) {
- seeds.add(new ORCLUSCluster(database.get(id), id, factory));
+ for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) {
+ seeds.add(new ORCLUSCluster(database.get(iter), iter.getDBID(), factory));
}
return seeds;
}
@@ -240,10 +244,8 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
}
// for each data point o do
- Iterator<DBID> it = database.iterDBIDs();
- while(it.hasNext()) {
- DBID id = it.next();
- V o = database.get(id);
+ for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ V o = database.get(it);
DoubleDistance minDist = null;
ORCLUSCluster minCluster = null;
@@ -260,7 +262,7 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
}
// add p to the cluster with the least value of projected distance
assert minCluster != null;
- minCluster.objectIDs.add(id);
+ minCluster.objectIDs.add(it);
}
// recompute the seed in each clusters
@@ -285,10 +287,9 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
// covariance matrix of cluster
// Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs);
List<DistanceResultPair<DoubleDistance>> results = new ArrayList<DistanceResultPair<DoubleDistance>>(cluster.objectIDs.size());
- for(Iterator<DBID> it = cluster.objectIDs.iterator(); it.hasNext();) {
- DBID id = it.next();
- DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(id));
- DistanceResultPair<DoubleDistance> qr = new GenericDistanceResultPair<DoubleDistance>(distance, id);
+ for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) {
+ DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it));
+ DistanceResultPair<DoubleDistance> qr = new GenericDistanceResultPair<DoubleDistance>(distance, it.getDBID());
results.add(qr);
}
Collections.sort(results);
@@ -407,9 +408,8 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
DoubleDistance sum = getDistanceFunction().getDistanceFactory().nullDistance();
V c_proj = projection(c_ij, c_ij.centroid, factory);
- for(DBID id : c_ij.objectIDs) {
- V o = database.get(id);
- V o_proj = projection(c_ij, o, factory);
+ for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) {
+ V o_proj = projection(c_ij, database.get(iter), factory);
DoubleDistance dist = distFunc.distance(o_proj, c_proj);
sum = sum.plus(dist.times(dist));
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java
index 86e045cb..b0a12832 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java
@@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.HyperBoundingBox;
import de.lmu.ifi.dbs.elki.data.ParameterizationFunction;
import de.lmu.ifi.dbs.elki.data.spatial.SpatialUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -114,7 +115,8 @@ public class CASHIntervalSplit {
f_maxima.put(interval, maxima);
}
- for(DBID id : superSetIDs) {
+ for(DBIDIter iter = superSetIDs.iter(); iter.valid(); iter.advance()) {
+ DBID id = iter.getDBID();
Double f_min = minima.get(id);
Double f_max = maxima.get(id);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java
new file mode 100644
index 00000000..e75a89dc
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java
@@ -0,0 +1,80 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+
+/**
+ * Predicate for GeneralizedDBSCAN to evaluate whether a point is a core point
+ * or not.
+ *
+ * Note the Factory/Instance split of this interface.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has Instance
+ */
+public interface CorePredicate {
+ /**
+ * Constant for the generic type {@code List<? extends DistanceResultPair<?>>}
+ */
+ public static final String NEIGHBOR_LIST = "neighborhood-list";
+
+ /**
+ * Instantiate for a database.
+ *
+ * @param database Database to instantiate for
+ * @param type Type to instantiate for
+ * @return Instance
+ */
+ public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type);
+
+ /**
+ * Test whether the neighborhood type T is accepted by this predicate.
+ *
+ * @param type Type information
+ * @return true when the type is accepted
+ */
+ public boolean acceptsType(SimpleTypeInformation<?> type);
+
+ /**
+ * Instance for a particular data set.
+ *
+ * @author Erich Schubert
+ *
+ * @param <T> actual type
+ */
+ public static interface Instance<T> {
+ /**
+ * Decide whether the point is a core point, based on its neighborhood.
+ *
+ * @param point Query point
+ * @param neighbors Neighbors
+ * @return core point property
+ */
+ public boolean isCorePoint(DBIDRef point, T neighbors);
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
new file mode 100644
index 00000000..cb24e8f1
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
@@ -0,0 +1,268 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.QueryUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DistanceDBIDResult;
+import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * The default DBSCAN and OPTICS neighbor predicate, using an
+ * epsilon-neighborhood.
+ *
+ * <p>
+ * Reference: <br>
+ * M. Ester, H.-P. Kriegel, J. Sander, and X. Xu: A Density-Based Algorithm for
+ * Discovering Clusters in Large Spatial Databases with Noise. <br>
+ * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96),
+ * Portland, OR, 1996.
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @param <D> Distance type
+ */
+@Reference(authors = "M. Ester, H.-P. Kriegel, J. Sander, and X. Xu", title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.71.1980")
+public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements NeighborPredicate {
+ /**
+ * Range to query with
+ */
+ D epsilon;
+
+ /**
+ * Distance function to use
+ */
+ DistanceFunction<O, D> distFunc;
+
+ /**
+ * Full constructor.
+ *
+ * @param epsilon Epsilon value
+ * @param distFunc Distance function to use
+ */
+ public EpsilonNeighborPredicate(D epsilon, DistanceFunction<O, D> distFunc) {
+ super();
+ this.epsilon = epsilon;
+ this.distFunc = distFunc;
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type) {
+ if(TypeUtil.DBIDS.isAssignableFromType(type)) {
+ DistanceQuery<O, D> dq = QueryUtil.getDistanceQuery(database, distFunc);
+ RangeQuery<O, D> rq = database.getRangeQuery(dq);
+ return (Instance<T>) new DBIDInstance<D>(epsilon, rq, dq.getRelation().getDBIDs());
+ }
+ if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) {
+ DistanceQuery<O, D> dq = QueryUtil.getDistanceQuery(database, distFunc);
+ RangeQuery<O, D> rq = database.getRangeQuery(dq);
+ return (Instance<T>) new NeighborListInstance<D>(epsilon, rq, dq.getRelation().getDBIDs());
+ }
+ throw new AbortException("Incompatible predicate types");
+ }
+
+ @Override
+ public SimpleTypeInformation<?>[] getOutputType() {
+ return new SimpleTypeInformation<?>[] { TypeUtil.DBIDS, TypeUtil.NEIGHBORLIST };
+ }
+
+ @Override
+ public TypeInformation getInputTypeRestriction() {
+ return distFunc.getInputTypeRestriction();
+ }
+
+ /**
+ * Instance for a particular data set.
+ *
+ * @author Erich Schubert
+ */
+ public static class DBIDInstance<D extends Distance<D>> implements NeighborPredicate.Instance<DBIDs> {
+ /**
+ * Range to query with
+ */
+ D epsilon;
+
+ /**
+ * Range query to use on the database.
+ */
+ RangeQuery<?, D> rq;
+
+ /**
+ * DBIDs to process
+ */
+ DBIDs ids;
+
+ /**
+ * Constructor.
+ *
+ * @param epsilon Epsilon
+ * @param rq Range query to use
+ * @param ids DBIDs to process
+ */
+ public DBIDInstance(D epsilon, RangeQuery<?, D> rq, DBIDs ids) {
+ super();
+ this.epsilon = epsilon;
+ this.rq = rq;
+ this.ids = ids;
+ }
+
+ @Override
+ public DBIDs getIDs() {
+ return ids;
+ }
+
+ @Override
+ public DBIDs getNeighbors(DBIDRef reference) {
+ List<DistanceResultPair<D>> res = rq.getRangeForDBID(reference, epsilon);
+ // Throw away the actual distance values ...
+ ModifiableDBIDs neighbors = DBIDUtil.newHashSet(res.size());
+ for(DistanceResultPair<D> dr : res) {
+ neighbors.add(dr);
+ }
+ return neighbors;
+ }
+
+ @Override
+ public void addDBIDs(ModifiableDBIDs ids, DBIDs neighbors) {
+ ids.addDBIDs(neighbors);
+ }
+ }
+
+ /**
+ * Instance for a particular data set.
+ *
+ * @author Erich Schubert
+ */
+ public static class NeighborListInstance<D extends Distance<D>> implements NeighborPredicate.Instance<DistanceDBIDResult<D>> {
+ /**
+ * Range to query with
+ */
+ D epsilon;
+
+ /**
+ * Range query to use on the database.
+ */
+ RangeQuery<?, D> rq;
+
+ /**
+ * DBIDs to process
+ */
+ DBIDs ids;
+
+ /**
+ * Constructor.
+ *
+ * @param epsilon Epsilon
+ * @param rq Range query to use
+ * @param ids DBIDs to process
+ */
+ public NeighborListInstance(D epsilon, RangeQuery<?, D> rq, DBIDs ids) {
+ super();
+ this.epsilon = epsilon;
+ this.rq = rq;
+ this.ids = ids;
+ }
+
+ @Override
+ public DBIDs getIDs() {
+ return ids;
+ }
+
+ @Override
+ public DistanceDBIDResult<D> getNeighbors(DBIDRef reference) {
+ return rq.getRangeForDBID(reference, epsilon);
+ }
+
+ @Override
+ public void addDBIDs(ModifiableDBIDs ids, DistanceDBIDResult<D> neighbors) {
+ for(DistanceResultPair<D> neighbor : neighbors) {
+ ids.add(neighbor);
+ }
+ }
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O, D extends Distance<D>> extends AbstractParameterizer {
+ /**
+ * Range to query with
+ */
+ D epsilon;
+
+ /**
+ * Distance function to use
+ */
+ DistanceFunction<O, D> distfun = null;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ // Get a distance function.
+ ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<DistanceFunction<O, D>>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
+ D distanceFactory = null;
+ if(config.grab(distanceP)) {
+ distfun = distanceP.instantiateClass(config);
+ distanceFactory = distfun.getDistanceFactory();
+ }
+ // Get the epsilon parameter
+ DistanceParameter<D> epsilonP = new DistanceParameter<D>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory);
+ if(config.grab(epsilonP)) {
+ epsilon = epsilonP.getValue();
+ }
+ }
+
+ @Override
+ protected EpsilonNeighborPredicate<O, D> makeInstance() {
+ return new EpsilonNeighborPredicate<O, D>(epsilon, distfun);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java
new file mode 100644
index 00000000..2e1c2093
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java
@@ -0,0 +1,323 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.list.array.TIntArrayList;
+
+import java.util.ArrayList;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
+import de.lmu.ifi.dbs.elki.data.model.Model;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Generalized DBSCAN, density-based clustering with noise.
+ * <p>
+ * Reference:<br />
+ * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu:<br />
+ * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its
+ * Applications<br />
+ * In: Data Mining and Knowledge Discovery, 1998.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @author Arthur Zimek
+ *
+ * @apiviz.has Instance
+ */
+@Reference(authors = "Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu", title = "Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its Applications", booktitle = "Data Mining and Knowledge Discovery", url = "http://dx.doi.org/10.1023/A:1009745219419")
+public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
+ /**
+ * Get a logger for this algorithm
+ */
+ final static Logging logger = Logging.getLogger(GeneralizedDBSCAN.class);
+
+ /**
+ * The neighborhood predicate factory.
+ */
+ NeighborPredicate npred;
+
+ /**
+ * The core predicate factory.
+ */
+ CorePredicate corepred;
+
+ /**
+ * Constructor for parameterized algorithm.
+ *
+ * @param npred Neighbor predicate
+ * @param corepred Core point predicate
+ */
+ public GeneralizedDBSCAN(NeighborPredicate npred, CorePredicate corepred) {
+ super();
+ this.npred = npred;
+ this.corepred = corepred;
+ }
+
+ @Override
+ public Clustering<Model> run(Database database) {
+ for (SimpleTypeInformation<?> t : npred.getOutputType()) {
+ if (corepred.acceptsType(t)) {
+ return new Instance<Object>(npred.instantiate(database, t), corepred.instantiate(database, t)).run();
+ }
+ }
+ throw new AbortException("No compatible types found.");
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(npred.getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Instance for a particular data set.
+ *
+ * @author Erich Schubert
+ */
+ public class Instance<T> {
+ /**
+ * The neighborhood predicate
+ */
+ final NeighborPredicate.Instance<T> npred;
+
+ /**
+ * The core object property
+ */
+ final CorePredicate.Instance<T> corepred;
+
+ /**
+ * Full Constructor
+ *
+ * @param npred Neighborhood predicate
+ * @param corepred Core object predicate
+ */
+ public Instance(NeighborPredicate.Instance<T> npred, CorePredicate.Instance<T> corepred) {
+ super();
+ this.npred = npred;
+ this.corepred = corepred;
+ }
+
+ /**
+ * Run the actual DBSCAN algorithm.
+ *
+ * @return Clustering result
+ */
+ public Clustering<Model> run() {
+ final DBIDs ids = npred.getIDs();
+ // Setup progress logging
+ final FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", ids.size(), logger) : null;
+ final IndefiniteProgress clusprogress = logger.isVerbose() ? new IndefiniteProgress("Clusters", logger) : null;
+ // (Temporary) store the cluster ID assigned.
+ final WritableIntegerDataStore clusterids = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -2);
+ // Note: these are not exact!
+ final TIntArrayList clustersizes = new TIntArrayList();
+
+ // Implementation Note: using Integer objects should result in
+ // reduced memory use in the HashMap!
+ final int noiseid = -1;
+ int clusterid = 0;
+ int clustersize = 0;
+ int noisesize = 0;
+ // Iterate over all objects in the database.
+ for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
+ // Skip already processed ids.
+ if(clusterids.intValue(id) > -2) {
+ continue;
+ }
+ // Evaluate Neighborhood predicate
+ final T neighbors = npred.getNeighbors(id);
+ // Evaluate Core-Point predicate:
+ if(corepred.isCorePoint(id, neighbors)) {
+ clusterids.putInt(id, clusterid);
+ clustersize = 1 + setbasedExpandCluster(clusterid, clusterids, neighbors, progress);
+ // start next cluster on next iteration.
+ clustersizes.add(clustersize);
+ clustersize = 0;
+ clusterid += 1;
+ if(clusprogress != null) {
+ clusprogress.setProcessed(clusterid, logger);
+ }
+ }
+ else {
+ // otherwise, it's a noise point
+ clusterids.putInt(id, noiseid);
+ noisesize += 1;
+ }
+ // We've completed this element
+ if(progress != null) {
+ progress.incrementProcessed(logger);
+ }
+ }
+ // Finish progress logging.
+ if(progress != null) {
+ progress.ensureCompleted(logger);
+ }
+ if(clusprogress != null) {
+ clusprogress.setCompleted(logger);
+ }
+
+ // Transform cluster ID mapping into a clustering result:
+ ArrayList<ArrayModifiableDBIDs> clusterlists = new ArrayList<ArrayModifiableDBIDs>(clusterid + 1);
+ // add noise cluster storage
+ clusterlists.add(DBIDUtil.newArray(noisesize));
+ // add storage containers for clusters
+ for(int i = 0; i < clustersizes.size(); i++) {
+ clusterlists.add(DBIDUtil.newArray(clustersizes.get(i)));
+ }
+ // do the actual inversion
+ for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
+ int cluster = clusterids.intValue(id);
+ clusterlists.get(cluster + 1).add(id);
+ }
+ clusterids.destroy();
+
+ Clustering<Model> result = new Clustering<Model>("GDBSCAN", "gdbscan-clustering");
+ int cid = 0;
+ for(ArrayModifiableDBIDs res : clusterlists) {
+ boolean isNoise = (cid == 0);
+ Cluster<Model> c = new Cluster<Model>(res, isNoise, ClusterModel.CLUSTER);
+ result.addCluster(c);
+ cid++;
+ }
+ return result;
+ }
+
+ /**
+ * Set-based expand cluster implementation.
+ *
+ * @param clusterid ID of the current cluster.
+ * @param clusterids Current object to cluster mapping.
+ * @param neighbors Neighbors acquired by initial getNeighbors call.
+ * @param progress Progress logging
+ *
+ * @return cluster size;
+ */
+ protected int setbasedExpandCluster(final int clusterid, final WritableIntegerDataStore clusterids, final T neighbors, final FiniteProgress progress) {
+ int clustersize = 0;
+ final ArrayModifiableDBIDs activeSet = DBIDUtil.newArray();
+ npred.addDBIDs(activeSet, neighbors);
+ // run expandCluster as long as this set is non-empty (non-recursive
+ // implementation)
+ while(!activeSet.isEmpty()) {
+ final DBID id = activeSet.remove(activeSet.size() - 1);
+ clustersize += 1;
+ // Assign object to cluster
+ final int oldclus = clusterids.putInt(id, clusterid);
+ if(oldclus == -2) {
+ // expandCluster again:
+ // Evaluate Neighborhood predicate
+ final T newneighbors = npred.getNeighbors(id);
+ // Evaluate Core-Point predicate
+ if(corepred.isCorePoint(id, newneighbors)) {
+ // Note: the recursion is unrolled into iteration over the active
+ // set.
+ npred.addDBIDs(activeSet, newneighbors);
+ }
+ if(progress != null) {
+ progress.incrementProcessed(logger);
+ }
+ }
+ }
+ return clustersize;
+ }
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Neighborhood predicate
+ */
+ NeighborPredicate npred = null;
+
+ /**
+ * Core point predicate
+ */
+ CorePredicate corepred = null;
+
+ /**
+ * Parameter for neighborhood predicate
+ */
+ public final static OptionID NEIGHBORHOODPRED_ID = OptionID.getOrCreateOptionID("gdbscan.neighborhood", "Neighborhood predicate for GDBSCAN");
+
+ /**
+ * Parameter for core predicate
+ */
+ public final static OptionID COREPRED_ID = OptionID.getOrCreateOptionID("gdbscan.core", "Core point predicate for GDBSCAN");
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ // Neighborhood predicate
+ ObjectParameter<NeighborPredicate> npredOpt = new ObjectParameter<NeighborPredicate>(NEIGHBORHOODPRED_ID, NeighborPredicate.class, EpsilonNeighborPredicate.class);
+ if(config.grab(npredOpt)) {
+ npred = npredOpt.instantiateClass(config);
+ }
+
+ // Core point predicate
+ ObjectParameter<CorePredicate> corepredOpt = new ObjectParameter<CorePredicate>(COREPRED_ID, CorePredicate.class, MinPtsCorePredicate.class);
+ if(config.grab(corepredOpt)) {
+ corepred = corepredOpt.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected GeneralizedDBSCAN makeInstance() {
+ return new GeneralizedDBSCAN(npred, corepred);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
new file mode 100644
index 00000000..b9852eca
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
@@ -0,0 +1,178 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * The DBSCAN default core point predicate -- having at least {@link #minpts}
+ * neighbors.
+ *
+ * <p>
+ * Reference: <br>
+ * M. Ester, H.-P. Kriegel, J. Sander, and X. Xu: A Density-Based Algorithm for
+ * Discovering Clusters in Large Spatial Databases with Noise. <br>
+ * In Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96),
+ * Portland, OR, 1996.
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has Instance
+ */
+@Reference(authors = "M. Ester, H.-P. Kriegel, J. Sander, and X. Xu", title = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise", booktitle = "Proc. 2nd Int. Conf. on Knowledge Discovery and Data Mining (KDD '96), Portland, OR, 1996", url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.71.1980")
+public class MinPtsCorePredicate implements CorePredicate {
+  /**
+   * The minpts parameter: minimum neighborhood size to be a core point.
+   */
+  final int minpts;
+
+  /**
+   * Default constructor.
+   *
+   * @param minpts Minimum number of neighbors to be a core point.
+   */
+  public MinPtsCorePredicate(int minpts) {
+    super();
+    this.minpts = minpts;
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type) {
+    // Dispatch on the neighborhood representation produced upstream.
+    if(TypeUtil.DBIDS.isAssignableFromType(type)) {
+      return (Instance<T>) new DBIDsInstance(minpts);
+    }
+    if(TypeUtil.NEIGHBORLIST.isAssignableFromType(type)) {
+      return (Instance<T>) new NeighborListInstance(minpts);
+    }
+    throw new AbortException("Incompatible predicate types");
+  }
+
+  @Override
+  public boolean acceptsType(SimpleTypeInformation<?> type) {
+    // Supported neighborhoods: plain DBID sets and distance-annotated lists.
+    return TypeUtil.DBIDS.isAssignableFromType(type) || TypeUtil.NEIGHBORLIST.isAssignableFromType(type);
+  }
+
+  /**
+   * Instance for a particular data set, on DBID set neighborhoods.
+   *
+   * @author Erich Schubert
+   */
+  public static class DBIDsInstance implements CorePredicate.Instance<DBIDs> {
+    /**
+     * The minpts parameter.
+     */
+    final int minpts;
+
+    /**
+     * Constructor for this predicate.
+     *
+     * @param minpts MinPts parameter
+     */
+    public DBIDsInstance(int minpts) {
+      super();
+      this.minpts = minpts;
+    }
+
+    @Override
+    public boolean isCorePoint(DBIDRef point, DBIDs neighbors) {
+      // Core iff the neighborhood (including the point, if the neighbor
+      // predicate reports it) is large enough.
+      return neighbors.size() >= minpts;
+    }
+  }
+
+  /**
+   * Instance for a particular data set, on distance-neighbor lists.
+   *
+   * @author Erich Schubert
+   */
+  public static class NeighborListInstance implements CorePredicate.Instance<List<? extends DistanceResultPair<?>>> {
+    /**
+     * The minpts parameter.
+     */
+    final int minpts;
+
+    /**
+     * Constructor for this predicate.
+     *
+     * @param minpts MinPts parameter
+     */
+    public NeighborListInstance(int minpts) {
+      super();
+      this.minpts = minpts;
+    }
+
+    @Override
+    public boolean isCorePoint(DBIDRef point, List<? extends DistanceResultPair<?>> neighbors) {
+      return neighbors.size() >= minpts;
+    }
+  }
+
+  /**
+   * Parameterization class
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    /**
+     * Minpts value
+     */
+    int minpts;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      // Get the minpts parameter, reusing the option ID of classic DBSCAN.
+      IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID);
+      if(config.grab(minptsP)) {
+        minpts = minptsP.getValue();
+      }
+    }
+
+    @Override
+    protected MinPtsCorePredicate makeInstance() {
+      return new MinPtsCorePredicate(minpts);
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java
new file mode 100644
index 00000000..4f9eca27
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java
@@ -0,0 +1,94 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+
+/**
+ * Get the neighbors of an object
+ *
+ * Note the Factory/Instance split of this interface.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has Instance
+ */
+public interface NeighborPredicate {
+  /**
+   * Instantiate for a database.
+   *
+   * @param database Database to instantiate for
+   * @param type Neighborhood output type to produce (see {@link #getOutputType()})
+   * @param <T> Neighborhood data type
+   * @return Instance
+   */
+  public <T> Instance<T> instantiate(Database database, SimpleTypeInformation<?> type);
+
+  /**
+   * Input data type restriction.
+   *
+   * @return Type restriction
+   */
+  public TypeInformation getInputTypeRestriction();
+
+  /**
+   * Output data type information: the neighborhood representations this
+   * factory can produce, in order of preference.
+   *
+   * @return Type information
+   */
+  public SimpleTypeInformation<?>[] getOutputType();
+
+  /**
+   * Instance for a particular data set.
+   *
+   * @author Erich Schubert
+   *
+   * @param <T> Neighborhood data type
+   */
+  public static interface Instance<T> {
+    /**
+     * Get the neighbors of a reference object for DBSCAN.
+     *
+     * @param reference Reference object
+     * @return Neighborhood
+     */
+    public T getNeighbors(DBIDRef reference);
+
+    /**
+     * Get the IDs the predicate is defined for.
+     *
+     * @return Database ids
+     */
+    public DBIDs getIDs();
+
+    /**
+     * Add the neighbors to a DBID set.
+     *
+     * @param ids ID set to add to
+     * @param neighbors Neighbors to add
+     */
+    public void addDBIDs(ModifiableDBIDs ids, T neighbors);
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java
new file mode 100644
index 00000000..8be23c7d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java
@@ -0,0 +1,43 @@
+/**
+ * <p>Generalized DBSCAN.</p>
+ *
+ * Generalized DBSCAN is an abstraction of the original DBSCAN idea,
+ * that allows the use of arbitrary "neighborhood" and "core point" predicates.
+ *
+ * For each object, the neighborhood as defined by the "neighborhood" predicate
+ * is retrieved - in original DBSCAN, this is the objects within an epsilon sphere
+ * around the query object. Then the core point predicate is evaluated to decide if
+ * the object is considered dense. If so, a cluster is started (or extended) to
+ * include the neighbors as well.
+ *
+ * <p>
+ * Reference:<br />
+ * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu:<br />
+ * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its
+ * Applications<br />
+ * In: Data Mining and Knowledge Discovery, 1998.
+ * </p>
+ */
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index d3c73b53..92862909 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -1,24 +1,5 @@
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
-import de.lmu.ifi.dbs.elki.data.Clustering;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
-import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
@@ -42,37 +23,39 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+
/**
* Abstract base class for k-means implementations.
*
* @author Erich Schubert
*
+ * @apiviz.composedOf KMeansInitialization
+ *
* @param <V> Vector type
* @param <D> Distance type
*/
-public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> {
- /**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater than 0.
- */
- public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
-
- /**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater or equal to 0, where 0 means no limit.
- */
- public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
-
- /**
- * Parameter to specify the random generator seed.
- */
- public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
-
- /**
- * Parameter to specify the initialization method
- */
- public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
-
+public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> implements KMeans {
/**
* Holds the value of {@link #K_ID}.
*/
@@ -94,6 +77,7 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
* @param distanceFunction distance function
* @param k k parameter
* @param maxiter Maxiter parameter
+ * @param initializer Function to generate the initial means
*/
public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
super(distanceFunction);
@@ -111,15 +95,15 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
* @param clusters cluster assignment
* @return true when the object was reassigned
*/
- protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) {
+ protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?, ?>> means, List<? extends ModifiableDBIDs> clusters) {
boolean changed = false;
if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
@@ -128,13 +112,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
mindist = dist;
}
}
- if(clusters.get(minIndex).add(id)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
for(int i = 0; i < k; i++) {
if(i != minIndex) {
- if(clusters.get(i).remove(id)) {
+ if(clusters.get(i).remove(iditer)) {
break;
}
}
@@ -144,9 +128,9 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
}
else {
final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
@@ -155,13 +139,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
mindist = dist;
}
}
- if(clusters.get(minIndex).add(id)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
for(int i = 0; i < k; i++) {
if(i != minIndex) {
- if(clusters.get(i).remove(id)) {
+ if(clusters.get(i).remove(iditer)) {
break;
}
}
@@ -185,25 +169,23 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
* @param database the database containing the vectors
* @return the mean vectors of the given clusters in the given database
*/
- protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) {
+ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> means, Relation<V> database) {
List<Vector> newMeans = new ArrayList<Vector>(k);
for(int i = 0; i < k; i++) {
ModifiableDBIDs list = clusters.get(i);
Vector mean = null;
- for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) {
- if(mean == null) {
- mean = database.get(clusterIter.next()).getColumnVector();
- }
- else {
- mean.plusEquals(database.get(clusterIter.next()).getColumnVector());
- }
- }
if(list.size() > 0) {
- assert mean != null;
- mean.timesEquals(1.0 / list.size());
+ double s = 1.0 / list.size();
+ DBIDIter iter = list.iter();
+ assert (iter.valid());
+ mean = database.get(iter).getColumnVector().timesEquals(s);
+ iter.advance();
+ for(; iter.valid(); iter.advance()) {
+ mean.plusTimesEquals(database.get(iter).getColumnVector(), s);
+ }
}
else {
- mean = means.get(i);
+ mean = means.get(i).getColumnVector();
}
newMeans.add(mean);
}
@@ -211,6 +193,36 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
}
/**
+   * Returns the median vectors of the given clusters in the given database.
+   *
+   * @param clusters the clusters to compute the medians for
+   * @param medians the previous medians, reused for empty clusters
+   * @param database the relation containing the vectors
+   * @return the per-dimension median vectors of the given clusters
+   */
+  protected List<NumberVector<?, ?>> medians(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?, ?>> medians, Relation<V> database) {
+    final int dim = medians.get(0).getDimensionality();
+    final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database);
+    List<NumberVector<?, ?>> newMedians = new ArrayList<NumberVector<?, ?>>(k);
+    for(int i = 0; i < k; i++) {
+      ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i));
+      if(list.size() > 0) {
+        Vector median = new Vector(dim);
+        // Compute the median independently in each dimension
+        // (dimensions appear to be 1-indexed in this API, hence d + 1).
+        for(int d = 0; d < dim; d++) {
+          sorter.setDimension(d + 1);
+          DBID id = QuickSelect.median(list, sorter);
+          median.set(d, database.get(id).doubleValue(d + 1));
+        }
+        newMedians.add(median);
+      }
+      else {
+        // Keep the previous median for an empty cluster.
+        newMedians.add(medians.get(i));
+      }
+    }
+    return newMedians;
+  }
+
+ /**
* Compute an incremental update for the mean
*
* @param mean Mean to update
@@ -239,16 +251,16 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
*/
protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
boolean changed = false;
-
+
if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
// Raw distance function
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
-
+
// Incremental update
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
@@ -261,13 +273,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
for(int i = 0; i < k; i++) {
ModifiableDBIDs ci = clusters.get(i);
if(i == minIndex) {
- if(ci.add(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ if(ci.add(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
changed = true;
}
}
- else if(ci.remove(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ else if(ci.remove(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
changed = true;
}
}
@@ -276,11 +288,11 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
else {
// Raw distance function
final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
-
+
// Incremental update
- for(DBID id : relation.iterDBIDs()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
- V fv = relation.get(id);
+ V fv = relation.get(iditer);
int minIndex = 0;
for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
@@ -293,13 +305,13 @@ public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Dis
for(int i = 0; i < k; i++) {
ModifiableDBIDs ci = clusters.get(i);
if(i == minIndex) {
- if(ci.add(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ if(ci.add(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
changed = true;
}
}
- else if(ci.remove(id)) {
- incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ else if(ci.remove(iditer)) {
+ incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
changed = true;
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
index b5f088fb..a8effecd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
@@ -22,7 +22,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
@@ -34,9 +33,9 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
*
* @param <V> Vector type
*/
-public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> {
+public abstract class AbstractKMeansInitialization<V> implements KMeansInitialization<V> {
/**
- * Holds the value of {@link KMeansLloyd#SEED_ID}.
+ * Holds the value of {@link KMeans#SEED_ID}.
*/
protected Long seed;
@@ -56,13 +55,13 @@ public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>>
*
* @apiviz.exclude
*/
- public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ public abstract static class Parameterizer<V> extends AbstractParameterizer {
protected Long seed;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true);
+ LongParameter seedP = new LongParameter(KMeans.SEED_ID, true);
if(config.grab(seedP)) {
seed = seedP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
index 78ccd426..7a7f2867 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
@@ -23,14 +23,16 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
@@ -40,20 +42,30 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
*
* @param <V> Vector type
*/
-public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsInitialization<V> {
/**
* Constructor.
*/
public FirstKInitialMeans() {
- super(null);
+ super();
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
- Iterator<DBID> iter = relation.iterDBIDs();
- List<Vector> means = new ArrayList<Vector>(k);
- for(int i = 0; i < k && iter.hasNext(); i++) {
- means.add(relation.get(iter.next()).getColumnVector());
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ DBIDIter iter = relation.iterDBIDs();
+ List<V> means = new ArrayList<V>(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(relation.get(iter));
+ }
+ return means;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) {
+ DBIDIter iter = distanceFunction.getRelation().iterDBIDs();
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+ for(int i = 0; i < k && iter.valid(); i++, iter.advance()) {
+ means.add(iter);
}
return means;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
new file mode 100644
index 00000000..37171d4a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java
@@ -0,0 +1,55 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Some constants and options shared among kmeans family algorithms.
+ *
+ * @author Erich Schubert
+ */
+public interface KMeans {
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
+
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater or equal to 0, where 0 means no limit.
+ */
+ public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
+
+ /**
+ * Parameter to specify the random generator seed.
+ */
+ public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
index f4c0d9c7..9e5d69f0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
@@ -24,19 +24,17 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
*/
import java.util.List;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
/**
* Interface for initializing K-Means
*
* @author Erich Schubert
*
- * @param <V> Vector type
+ * @param <V> Object type
*/
-public interface KMeansInitialization<V extends NumberVector<V, ?>> {
+public interface KMeansInitialization<V> {
/**
* Choose initial means
*
@@ -45,5 +43,5 @@ public interface KMeansInitialization<V extends NumberVector<V, ?>> {
* @param distanceFunction Distance function
* @return List of chosen means for k-means
*/
- public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
+ public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index fda1d6c0..b1b40632 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -39,7 +39,6 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
@@ -94,14 +93,13 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex
* @param database Database
* @param relation relation to use
* @return result
- * @throws IllegalStateException
*/
- public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
if(relation.size() <= 0) {
return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ List<? extends NumberVector<?, ?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
for(int i = 0; i < k; i++) {
@@ -124,7 +122,7 @@ public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> ex
final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
for(int i = 0; i < clusters.size(); i++) {
- MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
}
return result;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index 56492dd0..c729eb10 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -93,15 +93,17 @@ public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>>
*
* @param database Database
* @param relation relation to use
- * @return result
- * @throws IllegalStateException
+ * @return Clustering result
*/
- public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
if(relation.size() <= 0) {
return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
- List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ List<Vector> means = new ArrayList<Vector>(k);
+ for(NumberVector<?, ?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) {
+ means.add(nv.getColumnVector());
+ }
// Initialize cluster and assign objects
List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
for(int i = 0; i < k; i++) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
index c7a2fa1d..9afeff6c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -26,19 +26,18 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Random;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
@@ -59,7 +58,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
* @param <D> Distance type
*/
@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494")
-public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> {
+public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
/**
* Constructor.
*
@@ -70,7 +69,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
// Get a distance query
if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("K-Means++ initialization can only be used with numerical distances.");
@@ -80,14 +79,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
// Chose first mean
- List<Vector> means = new ArrayList<Vector>(k);
+ List<V> means = new ArrayList<V>(k);
Random random = (seed != null) ? new Random(seed) : new Random();
- DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next();
- means.add(relation.get(first).getColumnVector());
+ DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iter().getDBID();
+ means.add(relation.get(first));
- ModifiableDBIDs chosen = DBIDUtil.newHashSet(k);
- chosen.add(first);
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
// Initialize weights
double[] weights = new double[ids.size()];
@@ -107,16 +104,16 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
}
// Add new mean:
DBID newmean = ids.get(pos);
- means.add(relation.get(newmean).getColumnVector());
- chosen.add(newmean);
+ means.add(relation.get(newmean));
// Update weights:
weights[pos] = 0.0;
// Choose optimized version for double distances, if applicable.
- if (distF instanceof PrimitiveDoubleDistanceFunction) {
+ if(distF instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF;
weightsum = updateWeights(weights, ids, newmean, ddist, relation);
- } else {
+ }
+ else {
weightsum = updateWeights(weights, ids, newmean, distQ);
}
}
@@ -124,6 +121,48 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
return means;
}
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
+ if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("PAM initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
+ // Choose the first medoid
+ ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
+
+ Random random = (seed != null) ? new Random(seed) : new Random();
+ DBID first = DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random.nextLong()).iter().getDBID();
+ means.add(first);
+
+ ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs());
+ // Initialize weights
+ double[] weights = new double[ids.size()];
+ double weightsum = initialWeights(weights, ids, first, distQ);
+ while(means.size() < k) {
+ if(weightsum > Double.MAX_VALUE) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?");
+ }
+ if(weightsum < Double.MIN_NORMAL) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too few data points?");
+ }
+ double r = random.nextDouble() * weightsum;
+ int pos = 0;
+ while(r > 0 && pos < weights.length) {
+ r -= weights[pos];
+ pos++;
+ }
+ // Add new mean:
+ DBID newmean = ids.get(pos);
+ means.add(newmean);
+ // Update weights:
+ weights[pos] = 0.0;
+ weightsum = updateWeights(weights, ids, newmean, distQ);
+ }
+
+ return means;
+ }
+
/**
* Initialize the weight list.
*
@@ -133,16 +172,15 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
* @param distQ Distance query
* @return Weight sum
*/
- protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) {
double weightsum = 0.0;
DBIDIter it = ids.iter();
for(int i = 0; i < weights.length; i++, it.advance()) {
- DBID id = it.getDBID();
- if(latest.equals(id)) {
+ if(latest.sameDBID(it)) {
weights[i] = 0.0;
}
else {
- double d = distQ.distance(latest, id).doubleValue();
+ double d = distQ.distance(latest, it).doubleValue();
weights[i] = d * d;
}
weightsum += weights[i];
@@ -159,13 +197,12 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
* @param distQ Distance query
* @return Weight sum
*/
- protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<? super V, D> distQ) {
double weightsum = 0.0;
DBIDIter it = ids.iter();
for(int i = 0; i < weights.length; i++, it.advance()) {
- DBID id = it.getDBID();
if(weights[i] > 0.0) {
- double d = distQ.distance(latest, id).doubleValue();
+ double d = distQ.distance(latest, it).doubleValue();
weights[i] = Math.min(weights[i], d * d);
weightsum += weights[i];
}
@@ -187,9 +224,8 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
double weightsum = 0.0;
DBIDIter it = ids.iter();
for(int i = 0; i < weights.length; i++, it.advance()) {
- DBID id = it.getDBID();
if(weights[i] > 0.0) {
- double d = distF.doubleDistance(lv, rel.get(id));
+ double d = distF.doubleDistance(lv, rel.get(it));
weights[i] = Math.min(weights[i], d * d);
weightsum += weights[i];
}
@@ -204,7 +240,7 @@ public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
@Override
protected KMeansPlusPlusInitialMeans<V, D> makeInstance() {
return new KMeansPlusPlusInitialMeans<V, D>(seed);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
new file mode 100644
index 00000000..8c284981
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -0,0 +1,172 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-medians clustering algorithm, using Lloyd-style bulk
+ * iterations.
+ *
+ * Reference:
+ * <p>
+ * Clustering via Concave Minimization<br />
+ * P. S. Bradley, O. L. Mangasarian, W. N. Street<br />
+ * in: Advances in neural information processing systems
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+@Title("K-Medians")
+@Reference(title = "Clustering via Concave Minimization", authors = "P. S. Bradley, O. L. Mangasarian, W. N. Street", booktitle = "Advances in neural information processing systems", url="http://nips.djvuzone.org/djvu/nips09/0368.djvu")
+public class KMediansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMediansLloyd.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public KMediansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Run k-medians
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering");
+ }
+ // Choose initial medians
+ List<? extends NumberVector<?, ?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if(logger.isVerbose()) {
+ logger.verbose("K-Medians iteration " + (iteration + 1));
+ }
+ boolean changed = assignToNearestCluster(relation, medians, clusters);
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute medians.
+ medians = medians(clusters, medians, relation);
+ }
+ // Wrap result
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef()));
+ result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected AbstractKMeans<V, D> makeInstance() {
+ return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
new file mode 100644
index 00000000..a5c3d675
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -0,0 +1,271 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-medoids clustering algorithm, using a "bulk" variation of the
+ * "Partitioning Around Medoids" approach.
+ *
+ * In contrast to PAM, which will in each iteration update one medoid with one
+ * (arbitrary) non-medoid, this implementation follows the EM pattern. In the
+ * expectation step, the best medoid from the cluster members is chosen; in the
+ * M-step, the objects are reassigned to their nearest medoid.
+ *
+ * We do not have a reference for this algorithm. It borrows ideas from EM and
+ * PAM. If needed, you are welcome to cite it using the latest ELKI publication
+ * (this variation is likely not worth publishing on its own).
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MedoidModel
+ * @apiviz.composedOf KMedoidsInitialization
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMedoidsEM.class);
+
+ /**
+ * Holds the value of {@link AbstractKMeans#K_ID}.
+ */
+ protected int k;
+
+ /**
+ * Holds the value of {@link AbstractKMeans#MAXITER_ID}.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMedoidsInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Function to generate the initial means
+ */
+ public KMedoidsEM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Run k-medoids
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ }
+ DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ // Choose initial medoids
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+ Mean[] mdists = Mean.newArray(k);
+
+ // Initial assignment to nearest medoids
+ // TODO: reuse this information, from the build phase, when possible?
+ assignToNearestCluster(medoids, mdists, clusters, distQ);
+
+ // Swap phase
+ boolean changed = true;
+ while(changed) {
+ changed = false;
+ // Try to swap the medoid with a better cluster member:
+ for(int i = 0; i < k; i++) {
+ DBID med = medoids.get(i);
+ DBID best = null;
+ Mean bestm = mdists[i];
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(med.sameDBID(iter)) {
+ continue;
+ }
+ Mean mdist = new Mean();
+ for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
+ mdist.put(distQ.distance(iter, iter2).doubleValue());
+ }
+ if(mdist.getMean() < bestm.getMean()) {
+ best = iter.getDBID();
+ bestm = mdist;
+ }
+ }
+ if(best != null && !med.sameDBID(best)) {
+ changed = true;
+ medoids.set(i, best);
+ mdists[i] = bestm;
+ }
+ }
+ // Reassign
+ if(changed) {
+ assignToNearestCluster(medoids, mdists, clusters, distQ);
+ }
+ }
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MedoidModel model = new MedoidModel(medoids.get(i));
+ result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
+ * those FeatureVectors that are nearest to the k<sup>th</sup> medoid.
+ *
+ * @param means a list of k means
+ * @param mdist Mean distances
+ * @param clusters cluster assignment
+ * @param distQ distance query
+ * @return true when the object was reassigned
+ */
+ protected boolean assignToNearestCluster(ArrayDBIDs means, Mean[] mdist, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ boolean changed = false;
+
+ double[] dists = new double[k];
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ int minIndex = 0;
+ double mindist = Double.POSITIVE_INFINITY;
+ for(int i = 0; i < k; i++) {
+ dists[i] = distQ.distance(iditer, means.get(i)).doubleValue();
+ if(dists[i] < mindist) {
+ minIndex = i;
+ mindist = dists[i];
+ }
+ }
+ if(clusters.get(minIndex).add(iditer)) {
+ changed = true;
+ mdist[minIndex].put(mindist);
+ // Remove from previous cluster, and drop its distance from that cluster's mean
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
+ mdist[i].put(dists[i], -1); // was mdist[minIndex]: must subtract from the OLD cluster's mean
+ break;
+ }
+ }
+ }
+ }
+ }
+ return changed;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMedoidsInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected KMedoidsEM<V, D> makeInstance() {
+ return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
new file mode 100644
index 00000000..269e7e9e
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java
@@ -0,0 +1,45 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+
+/**
+ * Interface for initializing K-Medoids. In contrast to k-means initializers,
+ * this initialization will only return members of the original data set.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Object type
+ */
+public interface KMedoidsInitialization<V> {
+ /**
+ * Choose initial medoids
+ *
+ * @param k Parameter k
+ * @param distanceFunction Distance function
+ * @return List of chosen medoids
+ */
+ public abstract DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
new file mode 100644
index 00000000..30c80084
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -0,0 +1,310 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-medoids clustering algorithm, using the
+ * "Partitioning Around Medoids" approach.
+ *
+ * Reference:
+ * <p>
+ * Clustering by means of Medoids<br />
+ * Kaufman, L. and Rousseeuw, P.J.<br />
+ * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MedoidModel
+ * @apiviz.composedOf KMedoidsInitialization
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+@Title("Partitioning Around Medoids")
+@Reference(title = "Clustering by means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
+public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMedoidsPAM.class);
+
+ /**
+ * Holds the value of {@link AbstractKMeans#K_ID}.
+ */
+ protected int k;
+
+ /**
+ * Holds the value of {@link AbstractKMeans#MAXITER_ID}.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMedoidsInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Function to generate the initial means
+ */
+ public KMedoidsPAM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Run k-medoids
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ }
+ DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ DBIDs ids = relation.getDBIDs();
+ // Choose initial medoids
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+
+ WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ // Initial assignment to nearest medoids
+ // TODO: reuse this information, from the build phase, when possible?
+ assignToNearestCluster(medoids, ids, second, clusters, distQ);
+
+ // Swap phase
+ boolean changed = true;
+ while(changed) {
+ changed = false;
+ // Try to swap the medoid with a better cluster member:
+ double best = 0;
+ DBID bestid = null;
+ int bestcluster = -1;
+ for(int i = 0; i < k; i++) {
+ DBID med = medoids.get(i);
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(med.sameDBID(iter)) {
+ continue;
+ }
+ // double disti = distQ.distance(id, med).doubleValue();
+ double cost = 0;
+ for(int j = 0; j < k; j++) {
+ for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
+ double distcur = distQ.distance(iter2, medoids.get(j)).doubleValue();
+ double distnew = distQ.distance(iter2, iter).doubleValue();
+ if(j == i) {
+ // Cases 1 and 2.
+ double distsec = second.doubleValue(iter2);
+ if(distcur > distsec) {
+ // Case 1, other would switch to a third medoid
+ cost += distsec - distcur; // Always positive!
+ }
+ else { // Would remain with the candidate
+ cost += distnew - distcur; // Could be negative
+ }
+ }
+ else {
+ // Cases 3-4: objects from other clusters
+ if (distcur < distnew) {
+ // Case 3: no change
+ } else {
+ // Case 4: would switch to new medoid
+ cost += distnew - distcur; // Always negative
+ }
+ }
+ }
+ }
+ if (cost < best) {
+ best = cost;
+ bestid = iter.getDBID();
+ bestcluster = i;
+ }
+ }
+ }
+ if(logger.isDebugging()) {
+ logger.debug("Best cost: " + best);
+ }
+ if(bestid != null) {
+ changed = true;
+ medoids.set(bestcluster, bestid);
+ }
+ // Reassign
+ if(changed) {
+ // TODO: can we save some of these recomputations?
+ assignToNearestCluster(medoids, ids, second, clusters, distQ);
+ }
+ }
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MedoidModel model = new MedoidModel(medoids.get(i));
+ result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
+ * those FeatureVectors that are nearest to the k<sup>th</sup> medoid.
+ *
+ * @param means Object centroids
+ * @param ids Object ids
+ * @param second Distance to second nearest medoid
+ * @param clusters cluster assignment
+ * @param distQ distance query
+ * @return true when any object was reassigned
+ */
+ protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ boolean changed = false;
+
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ int minIndex = 0;
+ double mindist = Double.POSITIVE_INFINITY;
+ double mindist2 = Double.POSITIVE_INFINITY;
+ for(int i = 0; i < k; i++) {
+ double dist = distQ.distance(iditer, means.get(i)).doubleValue();
+ if(dist < mindist) {
+ minIndex = i;
+ mindist2 = mindist;
+ mindist = dist;
+ }
+ else if(dist < mindist2) {
+ mindist2 = dist;
+ }
+ }
+ if(clusters.get(minIndex).add(iditer)) {
+ changed = true;
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
+ break;
+ }
+ }
+ }
+ }
+ second.put(iditer, mindist2);
+ }
+ return changed;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMedoidsInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(KMeans.K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected KMedoidsPAM<V, D> makeInstance() {
+ return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
new file mode 100644
index 00000000..094c37bb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java
@@ -0,0 +1,187 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * PAM initialization for k-means (and of course, PAM).
+ *
+ * Reference:
+ * <p>
+ * Clustering by means of Medoids<br />
+ * Kaufman, L. and Rousseeuw, P.J.<br />
+ * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods
+ * </p>
+ *
+ * TODO: enforce using a distance matrix?
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+@Reference(title = "Clustering by means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
+public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMeansInitialization<V>, KMedoidsInitialization<V> {
+ /**
+ * Constructor.
+ */
+ public PAMInitialMeans() {
+ super();
+ }
+
+ @Override
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ // Get a distance query
+ if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("PAM initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
+ final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
+ DBIDs medids = chooseInitialMedoids(k, distQ);
+ List<V> medoids = new ArrayList<V>(k);
+ for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) {
+ medoids.add(relation.get(iter));
+ }
+ return medoids;
+ }
+
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
+ if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("PAM initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2;
+ final DBIDs ids = distQ.getRelation().getDBIDs();
+
+ ArrayModifiableDBIDs medids = DBIDUtil.newArray(k);
+ double best = Double.POSITIVE_INFINITY;
+ Mean mean = new Mean(); // Mean is numerically more stable than sum.
+ WritableDoubleDataStore mindist = null;
+
+ // First mean is chosen by having the smallest distance sum to all others.
+ {
+ DBID bestid = null;
+ WritableDoubleDataStore bestd = null;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ mean.reset();
+ for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ double d = distQ.distance(iter, iter2).doubleValue();
+ mean.put(d);
+ newd.putDouble(iter2, d);
+ }
+ if(mean.getMean() < best) {
+ best = mean.getMean();
+ bestid = iter.getDBID();
+ if(bestd != null) {
+ bestd.destroy();
+ }
+ bestd = newd;
+ }
+ else {
+ newd.destroy();
+ }
+ }
+ medids.add(bestid);
+ mindist = bestd;
+ }
+ assert (mindist != null);
+
+ // Subsequent means optimize the full criterion.
+ for(int i = 1; i < k; i++) {
+ DBID bestid = null;
+ WritableDoubleDataStore bestd = null;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ DBID id = iter.getDBID();
+ if(medids.contains(id)) {
+ continue;
+ }
+ WritableDoubleDataStore newd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ mean.reset();
+ for(DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
+ DBID other = iter2.getDBID();
+ double dn = distQ.distance(id, other).doubleValue();
+ double v = Math.min(dn, mindist.doubleValue(other));
+ mean.put(v);
+ newd.put(other, v);
+ }
+ assert (mean.getCount() == ids.size());
+ if(mean.getMean() < best) {
+ best = mean.getMean();
+ bestid = id;
+ if(bestd != null) {
+ bestd.destroy();
+ }
+ bestd = newd;
+ }
+ else {
+ newd.destroy();
+ }
+ }
+ if(bestid == null) {
+ throw new AbortException("No median found that improves the criterion function?!?");
+ }
+ medids.add(bestid);
+ mindist.destroy();
+ mindist = bestd;
+ }
+
+ mindist.destroy();
+ return medids;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ @Override
+ protected PAMInitialMeans<V, D> makeInstance() {
+ return new PAMInitialMeans<V, D>();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
index 30e59453..5b9da923 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
@@ -25,13 +25,12 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
import java.util.ArrayList;
import java.util.List;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
/**
+ * Initialize K-means by randomly choosing k existing elements as cluster
@@ -41,7 +40,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
*
* @param <V> Vector type
*/
-public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> {
/**
* Constructor.
*
@@ -52,15 +51,20 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed);
- List<Vector> means = new ArrayList<Vector>(k);
- for(DBID id : ids) {
- means.add(relation.get(id).getColumnVector());
+ List<V> means = new ArrayList<V>(k);
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ means.add(relation.get(iter));
}
return means;
}
+ @Override
+ public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distanceFunction) {
+ return DBIDUtil.randomSample(distanceFunction.getRelation().getDBIDs(), k, seed);
+ }
+
/**
* Parameterization class.
*
@@ -68,7 +72,7 @@ public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends Ab
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> {
@Override
protected RandomlyChosenInitialMeans<V> makeInstance() {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
index e8a466dd..00ed08c4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -30,7 +30,6 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
import de.lmu.ifi.dbs.elki.math.MathUtil;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@@ -53,10 +52,10 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends
}
@Override
- public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
final int dim = DatabaseUtil.dimensionality(relation);
Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
- List<Vector> means = new ArrayList<Vector>(k);
+ List<V> means = new ArrayList<V>(k);
final Random random = (this.seed != null) ? new Random(this.seed) : new Random();
for(int i = 0; i < k; i++) {
double[] r = MathUtil.randomDoubleArray(dim, random);
@@ -64,12 +63,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends
for(int d = 0; d < dim; d++) {
r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
}
- means.add(new Vector(r));
+ means.add(minmax.first.newNumberVector(r));
}
return means;
}
-
/**
* Parameterization class.
*
@@ -78,7 +76,6 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends
* @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
-
@Override
protected RandomlyGeneratedInitialMeans<V> makeInstance() {
return new RandomlyGeneratedInitialMeans<V>(seed);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
index e3b274a6..01a693e4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
@@ -27,14 +27,12 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace;
import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUEUnit;
import de.lmu.ifi.dbs.elki.data.Cluster;
@@ -46,6 +44,7 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -97,7 +96,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@Title("CLIQUE: Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications")
@Description("Grid-based algorithm to identify dense clusters in subspaces of maximum dimensionality.")
@Reference(authors = "R. Agrawal, J. Gehrke, D. Gunopulos, P. Raghavan", title = "Automatic Subspace Clustering of High Dimensional Data for Data Mining Applications", booktitle = "Proc. SIGMOD Conference, Seattle, WA, 1998", url = "http://dx.doi.org/10.1145/276304.276314")
-public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> {
+public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
/**
* The logger for this class.
*/
@@ -299,8 +298,8 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus
minima[d] = Double.MAX_VALUE;
}
// update minima and maxima
- for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) {
- V featureVector = database.get(it.next());
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ V featureVector = database.get(it);
updateMinMax(featureVector, minima, maxima);
}
for(int i = 0; i < maxima.length; i++) {
@@ -393,13 +392,15 @@ public class CLIQUE<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus
// identify dense units
double total = database.size();
- for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) {
- final DBID id = it.next();
- V featureVector = database.get(id);
+ for(DBIDIter it = database.iterDBIDs(); it.valid();) {
+ V featureVector = database.get(it);
+ final DBID id = it.getDBID();
+ it.advance();
for(CLIQUEUnit<V> unit : units) {
unit.addFeatureVector(id, featureVector);
// unit is a dense unit
- if(!it.hasNext() && unit.selectivity(total) >= tau) {
+ // FIXME: why it.valid()?
+ if(!it.valid() && unit.selectivity(total) >= tau) {
denseUnits.add(unit);
// add the dense unit to its subspace
int dim = unit.getIntervals().iterator().next().getDimension();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
index c4c1687b..df3fe8b5 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
@@ -34,7 +34,6 @@ import java.util.List;
import java.util.Map;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
@@ -100,7 +99,7 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
@Title("DiSH: Detecting Subspace cluster Hierarchies")
@Description("Algorithm to find hierarchical correlation clusters in subspaces.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15")
-public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> {
+public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
/**
* The logger for this class.
*/
@@ -162,8 +161,11 @@ public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste
/**
* Performs the DiSH algorithm on the given database.
+ *
+ * @param database Database to process
+ * @param relation Relation to process
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
// Instantiate DiSH distance (and thus run the preprocessor)
if(logger.isVerbose()) {
logger.verbose("*** Run DiSH preprocessor.");
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
index 3f16e907..4eedbecd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
@@ -28,7 +28,6 @@ import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
@@ -39,13 +38,13 @@ import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.Subspace;
-import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -87,11 +86,11 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
+// TODO: optimize by creating much less objects
@Title("PROCLUS: PROjected CLUStering")
@Description("Algorithm to find subspace clusters in high dimensional spaces.")
@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188")
-// TODO: make the generics reflect the SubspaceModel
-public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClustering<Clustering<Model>, V> {
+public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
/**
* The logger for this class.
*/
@@ -141,8 +140,11 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
/**
* Performs the PROCLUS algorithm on the given database.
+ *
+ * @param database Database to process
+ * @param relation Relation to process
*/
- public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException {
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database);
RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
final Random random = new Random();
@@ -193,6 +195,7 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
IndefiniteProgress cprogress = logger.isVerbose() ? new IndefiniteProgress("Current number of clusters:", logger) : null;
+ // TODO: Use DataStore and Trove for performance
Map<DBID, PROCLUSCluster> clusters = null;
int loops = 0;
while(loops < 10) {
@@ -229,9 +232,9 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
// build result
int numClusters = 1;
- Clustering<Model> result = new Clustering<Model>("ProClus clustering", "proclus-clustering");
+ Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("ProClus clustering", "proclus-clustering");
for(PROCLUSCluster c : finalClusters) {
- Cluster<Model> cluster = new Cluster<Model>(c.objectIDs);
+ Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.objectIDs);
cluster.setModel(new SubspaceModel<V>(new Subspace<V>(c.getDimensions()), c.centroid));
cluster.setName("cluster_" + numClusters++);
@@ -262,7 +265,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
// compute distances between each point in S and m_i
Map<DBID, DistanceResultPair<DoubleDistance>> distances = new HashMap<DBID, DistanceResultPair<DoubleDistance>>();
- for(DBID id : s) {
+ for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) {
+ DBID id = iter.getDBID();
DoubleDistance dist = distFunc.distance(id, m_i);
distances.put(id, new GenericDistanceResultPair<DoubleDistance>(dist, id));
}
@@ -278,7 +282,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
distances.remove(m_i);
// compute distances of each point to closest medoid
- for(DBID id : s) {
+ for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) {
+ DBID id = iter.getDBID();
DoubleDistance dist_new = distFunc.distance(id, m_i);
DoubleDistance dist_old = distances.get(id).getDistance();
@@ -323,12 +328,11 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
*/
private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) {
ArrayModifiableDBIDs m_list = DBIDUtil.newArray(m);
- for(DBID m_i : m_best) {
- m_list.remove(m_i);
- }
+ m_list.removeDBIDs(m_best);
ModifiableDBIDs m_current = DBIDUtil.newHashSet();
- for(DBID m_i : m_best) {
+ for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) {
+ DBID m_i = iter.getDBID();
if(m_bad.contains(m_i)) {
int currentSize = m_current.size();
while(m_current.size() == currentSize) {
@@ -358,11 +362,13 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
private Map<DBID, List<DistanceResultPair<DoubleDistance>>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) {
Map<DBID, List<DistanceResultPair<DoubleDistance>>> result = new HashMap<DBID, List<DistanceResultPair<DoubleDistance>>>();
- for(DBID m : medoids) {
+ for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
+ DBID m = iter.getDBID();
// determine minimum distance between current medoid m and any other
// medoid m_i
DoubleDistance minDist = null;
- for(DBID m_i : medoids) {
+ for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) {
+ DBID m_i = iter2.getDBID();
if(m_i == m) {
continue;
}
@@ -399,7 +405,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
int dim = DatabaseUtil.dimensionality(database);
Map<DBID, double[]> averageDistances = new HashMap<DBID, double[]>();
- for(DBID m_i : medoids) {
+ for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
+ DBID m_i = iter.getDBID();
V medoid_i = database.get(m_i);
List<DistanceResultPair<DoubleDistance>> l_i = localities.get(m_i);
double[] x_i = new double[dim];
@@ -417,7 +424,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
Map<DBID, Set<Integer>> dimensionMap = new HashMap<DBID, Set<Integer>>();
List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<CTriple<Double, DBID, Integer>>();
- for(DBID m_i : medoids) {
+ for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
+ DBID m_i = iter.getDBID();
Set<Integer> dims_i = new HashSet<Integer>();
dimensionMap.put(m_i, dims_i);
@@ -478,8 +486,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
for(int i = 0; i < clusters.size(); i++) {
PROCLUSCluster c_i = clusters.get(i);
double[] x_i = new double[dim];
- for(DBID id : c_i.objectIDs) {
- V o = database.get(id);
+ for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) {
+ V o = database.get(iter);
for(int d = 0; d < dim; d++) {
x_i[d] += Math.abs(c_i.centroid.doubleValue(d + 1) - o.doubleValue(d + 1));
}
@@ -560,8 +568,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
clusterIDs.put(m_i, DBIDUtil.newHashSet());
}
- for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) {
- DBID p_id = it.next();
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ DBID p_id = it.getDBID();
V p = database.get(p_id);
DistanceResultPair<DoubleDistance> minDist = null;
for(DBID m_i : dimensions.keySet()) {
@@ -610,8 +618,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
clusterIDs.put(i, DBIDUtil.newHashSet());
}
- for(Iterator<DBID> it = database.iterDBIDs(); it.hasNext();) {
- DBID p_id = it.next();
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ DBID p_id = it.getDBID();
V p = database.get(p_id);
Pair<DoubleDistance, Integer> minDist = null;
for(int i = 0; i < dimensions.size(); i++) {
@@ -707,8 +715,8 @@ public class PROCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClus
*/
private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) {
double avg = 0;
- for(DBID objectID : objectIDs) {
- V o = database.get(objectID);
+ for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) {
+ V o = database.get(iter);
avg += Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension));
}
return avg / objectIDs.size();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
index 963c0922..c47c74b6 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
@@ -30,7 +30,6 @@ import java.util.List;
import java.util.TreeMap;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
@@ -77,7 +76,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Elke Achtert
*
* @apiviz.uses DBSCAN
- * @apiviz.uses DimensionsSelectingEuclideanDistanceFunction
+ * @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction
* @apiviz.has SubspaceModel
*
* @param <V> the type of FeatureVector handled by this Algorithm
@@ -85,7 +84,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@Title("SUBCLU: Density connected Subspace Clustering")
@Description("Algorithm to detect arbitrarily shaped and positioned clusters in subspaces. SUBCLU delivers for each subspace the same clusters DBSCAN would have found, when applied to this subspace seperately.")
@Reference(authors = "K. Kailing, H.-P. Kriegel, P. Kröger", title = "Density connected Subspace Clustering for High Dimensional Data. ", booktitle = "Proc. SIAM Int. Conf. on Data Mining (SDM'04), Lake Buena Vista, FL, 2004")
-public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> {
+public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
/**
* The logger for this class.
*/
@@ -162,7 +161,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus
* @param relation Relation to process
* @return Clustering result
*/
- public Clustering<SubspaceModel<V>> run(Relation<V> relation) throws IllegalStateException {
+ public Clustering<SubspaceModel<V>> run(Relation<V> relation) {
final int dimensionality = DatabaseUtil.dimensionality(relation);
StepProgress stepprog = logger.isVerbose() ? new StepProgress(dimensionality) : null;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java
new file mode 100644
index 00000000..17eb3c19
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java
@@ -0,0 +1,39 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+
+/**
+ * Interface for subspace clustering algorithms that use a model derived from
+ * {@link SubspaceModel}, that can then be post-processed for outlier detection.
+ *
+ * @author Erich Schubert
+ *
+ * @param <M> Model type
+ */
+public interface SubspaceClusteringAlgorithm<M extends SubspaceModel<?>> extends ClusteringAlgorithm<Clustering<M>> {
+ // No additional constraints
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java
index 43c6a218..ee42a59f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java
@@ -39,7 +39,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -137,12 +141,12 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl
* @param relation The data input we use
*/
public Clustering<Model> run(Relation<?> relation) {
- HashMap<String, ModifiableDBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation);
+ HashMap<String, DBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation);
ModifiableDBIDs noiseids = DBIDUtil.newArray();
Clustering<Model> result = new Clustering<Model>("By Label Clustering", "bylabel-clustering");
- for(Entry<String, ModifiableDBIDs> entry : labelMap.entrySet()) {
- ModifiableDBIDs ids = entry.getValue();
+ for(Entry<String, DBIDs> entry : labelMap.entrySet()) {
+ DBIDs ids = entry.getValue();
if(ids.size() <= 1) {
noiseids.addDBIDs(ids);
continue;
@@ -170,12 +174,13 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl
* @param data the database storing the objects
* @return a mapping of labels to ids
*/
- private HashMap<String, ModifiableDBIDs> singleAssignment(Relation<?> data) {
- HashMap<String, ModifiableDBIDs> labelMap = new HashMap<String, ModifiableDBIDs>();
+ private HashMap<String, DBIDs> singleAssignment(Relation<?> data) {
+ HashMap<String, DBIDs> labelMap = new HashMap<String, DBIDs>();
- for(DBID id : data.iterDBIDs()) {
- String label = data.get(id).toString();
- assign(labelMap, label, id);
+ for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ final Object val = data.get(iditer);
+ String label = (val != null) ? val.toString() : null;
+ assign(labelMap, label, iditer);
}
return labelMap;
}
@@ -187,13 +192,13 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl
* @param data the database storing the objects
* @return a mapping of labels to ids
*/
- private HashMap<String, ModifiableDBIDs> multipleAssignment(Relation<?> data) {
- HashMap<String, ModifiableDBIDs> labelMap = new HashMap<String, ModifiableDBIDs>();
+ private HashMap<String, DBIDs> multipleAssignment(Relation<?> data) {
+ HashMap<String, DBIDs> labelMap = new HashMap<String, DBIDs>();
- for(DBID id : data.iterDBIDs()) {
- String[] labels = data.get(id).toString().split(" ");
+ for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ String[] labels = data.get(iditer).toString().split(" ");
for(String label : labels) {
- assign(labelMap, label, id);
+ assign(labelMap, label, iditer);
}
}
return labelMap;
@@ -206,14 +211,22 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl
* @param label the label of the object to be assigned
* @param id the id of the object to be assigned
*/
- private void assign(HashMap<String, ModifiableDBIDs> labelMap, String label, DBID id) {
+ private void assign(HashMap<String, DBIDs> labelMap, String label, DBIDRef id) {
if(labelMap.containsKey(label)) {
- labelMap.get(label).add(id);
+ DBIDs exist = labelMap.get(label);
+ if (exist instanceof DBID) {
+ ModifiableDBIDs n = DBIDUtil.newHashSet();
+ n.add((DBID)exist);
+ n.add(id);
+ labelMap.put(label, n);
+ } else {
+ assert(exist instanceof HashSetModifiableDBIDs);
+ assert (exist.size() > 1);
+ ((ModifiableDBIDs)exist).add(id);
+ }
}
else {
- ModifiableDBIDs n = DBIDUtil.newHashSet();
- n.add(id);
- labelMap.put(label, n);
+ labelMap.put(label, id.getDBID());
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java
index 228cc7e7..26bf525a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java
@@ -39,7 +39,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -96,28 +100,26 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering<
*
* @param relation The data input to use
*/
- public Clustering<Model> run(Relation<?> relation) throws IllegalStateException {
- HashMap<String, ModifiableDBIDs> labelmap = new HashMap<String, ModifiableDBIDs>();
+ public Clustering<Model> run(Relation<?> relation) {
+ HashMap<String, DBIDs> labelmap = new HashMap<String, DBIDs>();
ModifiableDBIDs noiseids = DBIDUtil.newArray();
- for(DBID id : relation.iterDBIDs()) {
- String label = relation.get(id).toString();
-
- if(labelmap.containsKey(label)) {
- labelmap.get(label).add(id);
- }
- else {
- ModifiableDBIDs n = DBIDUtil.newHashSet();
- n.add(id);
- labelmap.put(label, n);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ final Object val = relation.get(iditer);
+ if(val == null) {
+ noiseids.add(iditer);
+ continue;
}
+ String label = val.toString();
+
+ assign(labelmap, label, iditer);
}
ArrayList<Cluster<Model>> clusters = new ArrayList<Cluster<Model>>(labelmap.size());
- for(Entry<String, ModifiableDBIDs> entry : labelmap.entrySet()) {
- ModifiableDBIDs ids = entry.getValue();
- if(ids.size() <= 1) {
- noiseids.addDBIDs(ids);
+ for(Entry<String, DBIDs> entry : labelmap.entrySet()) {
+ DBIDs ids = entry.getValue();
+ if(ids instanceof DBID) {
+ noiseids.add((DBID) ids);
continue;
}
Cluster<Model> clus = new Cluster<Model>(entry.getKey(), ids, ClusterModel.CLUSTER, new ArrayList<Cluster<Model>>(), new ArrayList<Cluster<Model>>());
@@ -153,6 +155,33 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering<
return new Clustering<Model>("By Label Hierarchical Clustering", "bylabel-clustering", rootclusters);
}
+ /**
+ * Assigns the specified id to the labelMap according to its label
+ *
+ * @param labelMap the mapping of label to ids
+ * @param label the label of the object to be assigned
+ * @param id the id of the object to be assigned
+ */
+ private void assign(HashMap<String, DBIDs> labelMap, String label, DBIDRef id) {
+ if(labelMap.containsKey(label)) {
+ DBIDs exist = labelMap.get(label);
+ if(exist instanceof DBID) {
+ ModifiableDBIDs n = DBIDUtil.newHashSet();
+ n.add((DBID) exist);
+ n.add(id);
+ labelMap.put(label, n);
+ }
+ else {
+ assert (exist instanceof HashSetModifiableDBIDs);
+ assert (exist.size() > 1);
+ ((ModifiableDBIDs) exist).add(id);
+ }
+ }
+ else {
+ labelMap.put(label, id.getDBID());
+ }
+ }
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.GUESSED_LABEL);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java
new file mode 100644
index 00000000..f082db9c
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java
@@ -0,0 +1,74 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial;
+
+import de.lmu.ifi.dbs.elki.data.ClassLabel;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
+import de.lmu.ifi.dbs.elki.data.model.Model;
+import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Trivial class that will try to cluster by label, and fall back to an
+ * "all-in-one" clustering.
+ *
+ * @author Erich Schubert
+ */
+public class ByLabelOrAllInOneClustering extends ByLabelClustering {
+ /**
+ * Constructor.
+ */
+ public ByLabelOrAllInOneClustering() {
+ super();
+ }
+
+ @Override
+ public Clustering<Model> run(Database database) {
+ // Prefer a true class label
+ try {
+ Relation<ClassLabel> relation = database.getRelation(TypeUtil.CLASSLABEL);
+ return run(relation);
+ }
+ catch(NoSupportedDataTypeException e) {
+ // Ignore.
+ }
+ try {
+ Relation<ClassLabel> relation = database.getRelation(TypeUtil.GUESSED_LABEL);
+ return run(relation);
+ }
+ catch(NoSupportedDataTypeException e) {
+ // Ignore.
+ }
+ final DBIDs ids = database.getRelation(TypeUtil.ANY).getDBIDs();
+ Clustering<Model> result = new Clustering<Model>("All-in-one trivial Clustering", "allinone-clustering");
+ Cluster<Model> c = new Cluster<Model>(ids, ClusterModel.CLUSTER);
+ result.addCluster(c);
+ return result;
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java
index cd45cda2..90ca3625 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java
@@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -102,14 +102,14 @@ public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> impl
public Clustering<Model> run(Relation<Model> relation) {
// Build model mapping
HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<Model, ModifiableDBIDs>();
- for(DBID id : relation.iterDBIDs()) {
- Model model = relation.get(id);
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ Model model = relation.get(iditer);
ModifiableDBIDs modelids = modelMap.get(model);
if(modelids == null) {
modelids = DBIDUtil.newHashSet();
modelMap.put(model, modelids);
}
- modelids.add(id);
+ modelids.add(iditer);
}
Clustering<Model> result = new Clustering<Model>("By Model Clustering", "bymodel-clustering");