summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java6
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java17
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java129
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java307
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java20
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java15
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java21
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java566
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java21
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java310
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java71
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java74
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java49
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java176
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java177
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java213
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java78
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java87
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java26
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java4
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java5
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java8
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java163
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java2
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java2
50 files changed, 2124 insertions, 481 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
index 5712d814..ea441655 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
index 95d88b93..108ba0ed 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -340,9 +340,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
}
}
- if(processedIDs.size() == distFunc.getRelation().size() && noise.size() == 0) {
+ /* if(processedIDs.size() == relation.size() && noise.size() == 0) {
break;
- }
+ } */
}
if(currentCluster.size() >= minpts) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java
index e28dbff3..5ec59777 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
index 2576c5f6..b59af555 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -33,7 +33,6 @@ import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
@@ -132,7 +131,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
/**
* Performs the DBSCAN algorithm on the given database.
*/
- public Clustering<Model> run(Database database, Relation<O> relation) {
+ public Clustering<Model> run(Relation<O> relation) {
RangeQuery<O, D> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
final int size = relation.size();
@@ -142,9 +141,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
noise = DBIDUtil.newHashSet();
processedIDs = DBIDUtil.newHashSet(size);
if(size >= minpts) {
- for(DBID id : rangeQuery.getRelation().iterDBIDs()) {
+ for(DBID id : relation.iterDBIDs()) {
if(!processedIDs.contains(id)) {
- expandCluster(database, rangeQuery, id, objprog, clusprog);
+ expandCluster(relation, rangeQuery, id, objprog, clusprog);
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), logger);
@@ -156,7 +155,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
}
else {
- for(DBID id : rangeQuery.getRelation().iterDBIDs()) {
+ for(DBID id : relation.iterDBIDs()) {
noise.add(id);
if(objprog != null && clusprog != null) {
objprog.setProcessed(noise.size(), logger);
@@ -189,12 +188,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
* <p/>
* Border-Objects become members of the first possible cluster.
*
- * @param database the database on which the algorithm is run
+ * @param relation Database relation to run on
* @param rangeQuery Range query to use
* @param startObjectID potential seed of a new potential cluster
* @param objprog the progress object for logging the current status
*/
- protected void expandCluster(Database database, RangeQuery<O, D> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
+ protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
List<DistanceResultPair<D>> seeds = rangeQuery.getRangeForDBID(startObjectID, epsilon);
// startObject is no core-object
@@ -245,7 +244,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
}
- if(processedIDs.size() == rangeQuery.getRelation().size() && noise.size() == 0) {
+ if(processedIDs.size() == relation.size() && noise.size() == 0) {
break;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
index ca401ddc..f1e6c945 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
index c1285659..a70a3f6f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,9 +25,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
import java.util.ArrayList;
import java.util.List;
-import java.util.Random;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.RandomlyGeneratedInitialMeans;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -42,6 +43,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
@@ -58,8 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualCons
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Provides the EM algorithm (clustering by expectation maximization).
@@ -113,6 +114,11 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
*/
public static final OptionID DELTA_ID = OptionID.getOrCreateOptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta");
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
+
private static final double MIN_LOGLIKELIHOOD = -100000;
/**
@@ -121,32 +127,27 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
private double delta;
/**
- * Parameter to specify the random generator seed.
+ * Store the individual probabilities, for use by EMOutlierDetection etc.
*/
- public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("em.seed", "The random number generator seed.");
+ private WritableDataStore<double[]> probClusterIGivenX;
/**
- * Holds the value of {@link #SEED_ID}.
+ * Class to choose the initial means
*/
- private Long seed;
-
- /**
- * Store the individual probabilities, for use by EMOutlierDetection etc.
- */
- private WritableDataStore<double[]> probClusterIGivenX;
+ private KMeansInitialization<V> initializer;
/**
* Constructor.
*
* @param k k parameter
* @param delta delta parameter
- * @param seed Seed parameter
+ * @param initializer Class to choose the initial means
*/
- public EM(int k, double delta, Long seed) {
+ public EM(int k, double delta, KMeansInitialization<V> initializer) {
super();
this.k = k;
this.delta = delta;
- this.seed = seed;
+ this.initializer = initializer;
}
/**
@@ -169,14 +170,14 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
if(logger.isVerbose()) {
logger.verbose("initializing " + k + " models");
}
- List<V> means = initialMeans(relation);
+ List<Vector> means = initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC);
List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k);
List<Double> normDistrFactor = new ArrayList<Double>(k);
List<Matrix> invCovMatr = new ArrayList<Matrix>(k);
List<Double> clusterWeights = new ArrayList<Double>(k);
probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
- int dimensionality = means.get(0).getDimensionality();
+ final int dimensionality = means.get(0).getDimensionality();
for(int i = 0; i < k; i++) {
Matrix m = Matrix.identity(dimensionality, dimensionality);
covarianceMatrices.add(m);
@@ -211,12 +212,12 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
em = emNew;
// recompute models
- List<V> meanSums = new ArrayList<V>(k);
+ List<Vector> meanSums = new ArrayList<Vector>(k);
double[] sumOfClusterProbabilities = new double[k];
for(int i = 0; i < k; i++) {
clusterWeights.set(i, 0.0);
- meanSums.add(means.get(i).nullVector());
+ meanSums.add(new Vector(dimensionality));
covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality));
}
@@ -226,24 +227,23 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
for(int i = 0; i < k; i++) {
sumOfClusterProbabilities[i] += clusterProbabilities[i];
- V summand = relation.get(id).multiplicate(clusterProbabilities[i]);
- V currentMeanSum = meanSums.get(i).plus(summand);
- meanSums.set(i, currentMeanSum);
+ Vector summand = relation.get(id).getColumnVector().timesEquals(clusterProbabilities[i]);
+ meanSums.get(i).plusEquals(summand);
}
}
final int n = relation.size();
for(int i = 0; i < k; i++) {
clusterWeights.set(i, sumOfClusterProbabilities[i] / n);
- V newMean = meanSums.get(i).multiplicate(1 / sumOfClusterProbabilities[i]);
+ Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]);
means.set(i, newMean);
}
// covariance matrices
for(DBID id : relation.iterDBIDs()) {
double[] clusterProbabilities = probClusterIGivenX.get(id);
- V instance = relation.get(id);
+ Vector instance = relation.get(id).getColumnVector();
for(int i = 0; i < k; i++) {
- V difference = instance.minus(means.get(i));
- covarianceMatrices.get(i).plusEquals(difference.getColumnVector().times(difference.getRowVector()).times(clusterProbabilities[i]));
+ Vector difference = instance.minus(means.get(i));
+ covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i]));
}
}
for(int i = 0; i < k; i++) {
@@ -281,13 +281,14 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
}
hardClusters.get(maxIndex).add(id);
}
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering");
// provide models within the result
for(int i = 0; i < k; i++) {
// TODO: re-do labeling.
// SimpleClassLabel label = new SimpleClassLabel();
// label.init(result.canonicalClusterLabel(i));
- Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(means.get(i), covarianceMatrices.get(i)));
+ Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i)));
result.addCluster(model);
}
return result;
@@ -308,24 +309,20 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
* @param clusterWeights the weights of the current clusters
* @return the expectation value of the current mixture of distributions
*/
- protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<V> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
+ protected double assignProbabilitiesToInstances(Relation<V> database, List<Double> normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, List<Double> clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
double emSum = 0.0;
for(DBID id : database.iterDBIDs()) {
- V x = database.get(id);
+ Vector x = database.get(id).getColumnVector();
List<Double> probabilities = new ArrayList<Double>(k);
for(int i = 0; i < k; i++) {
- V difference = x.minus(means.get(i));
- Matrix differenceRow = difference.getRowVector();
- Vector differenceCol = difference.getColumnVector();
- Matrix rowTimesCov = differenceRow.times(invCovMatr.get(i));
- Vector rowTimesCovTimesCol = rowTimesCov.times(differenceCol);
- double power = rowTimesCovTimesCol.get(0, 0) / 2.0;
+ Vector difference = x.minus(means.get(i));
+ double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference);
+ double power = rowTimesCovTimesCol / 2.0;
double prob = normDistrFactor.get(i) * Math.exp(-power);
if(logger.isDebuggingFinest()) {
- logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " differenceRow:\n" + FormatUtil.format(differenceRow, " ") + "\n" + " differenceCol:\n" + FormatUtil.format(differenceCol, " ") + "\n" + " rowTimesCov:\n" + FormatUtil.format(rowTimesCov, " ") + "\n" + " rowTimesCovTimesCol:\n" + FormatUtil.format(rowTimesCovTimesCol, " ") + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " "));
+ logger.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " "));
}
-
probabilities.add(prob);
}
double priorProbability = 0.0;
@@ -356,48 +353,6 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
}
/**
- * Creates {@link #k k} random points distributed uniformly within the
- * attribute ranges of the given database.
- *
- * @param relation the database must contain enough points in order to
- * ascertain the range of attribute values. Less than two points would
- * make no sense. The content of the database is not touched otherwise.
- * @return a list of {@link #k k} random points distributed uniformly within
- * the attribute ranges of the given database
- */
- protected List<V> initialMeans(Relation<V> relation) {
- final Random random;
- if(this.seed != null) {
- random = new Random(this.seed);
- }
- else {
- random = new Random();
- }
- if(relation.size() > 0) {
- final int dim = DatabaseUtil.dimensionality(relation);
- Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
- List<V> means = new ArrayList<V>(k);
- if(logger.isVerbose()) {
- logger.verbose("initializing random vectors");
- }
- for(int i = 0; i < k; i++) {
- double[] r = MathUtil.randomDoubleArray(dim, random);
- // Rescale
- for (int d = 0; d < dim; d++) {
- r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
- }
- // Instantiate
- V randomVector = minmax.first.newInstance(r);
- means.add(randomVector);
- }
- return means;
- }
- else {
- return new ArrayList<V>(0);
- }
- }
-
- /**
* Get the probabilities for a given point.
*
* @param index Point ID
@@ -429,7 +384,7 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
protected double delta;
- protected Long seed;
+ protected KMeansInitialization<V> initializer;
@Override
protected void makeOptions(Parameterization config) {
@@ -439,20 +394,20 @@ public class EM<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clusteri
k = kP.getValue();
}
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, new GreaterEqualConstraint(0.0), 0.0);
if(config.grab(deltaP)) {
delta = deltaP.getValue();
}
-
- LongParameter seedP = new LongParameter(SEED_ID, true);
- if(config.grab(seedP)) {
- seed = seedP.getValue();
- }
}
@Override
protected EM<V> makeInstance() {
- return new EM<V>(k, delta, seed);
+ return new EM<V>(k, delta, initializer);
}
}
} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java
deleted file mode 100644
index 38ea89c2..00000000
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/KMeans.java
+++ /dev/null
@@ -1,307 +0,0 @@
-package de.lmu.ifi.dbs.elki.algorithm.clustering;
-
-/*
- This file is part of ELKI:
- Environment for Developing KDD-Applications Supported by Index-Structures
-
- Copyright (C) 2011
- Ludwig-Maximilians-Universität München
- Lehr- und Forschungseinheit für Datenbanksysteme
- ELKI Development Team
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Random;
-
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
-import de.lmu.ifi.dbs.elki.data.Cluster;
-import de.lmu.ifi.dbs.elki.data.Clustering;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
-import de.lmu.ifi.dbs.elki.data.model.MeanModel;
-import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
-import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
-import de.lmu.ifi.dbs.elki.logging.Logging;
-import de.lmu.ifi.dbs.elki.math.MathUtil;
-import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
-import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
-
-/**
- * Provides the k-means algorithm.
- * <p>
- * Reference: J. MacQueen: Some Methods for Classification and Analysis of
- * Multivariate Observations. <br>
- * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297.
- * </p>
- *
- * @author Arthur Zimek
- *
- * @apiviz.has MeanModel
- *
- * @param <D> a type of {@link Distance} as returned by the used distance
- * function
- * @param <V> a type of {@link NumberVector} as a suitable datatype for this
- * algorithm
- */
-@Title("K-Means")
-@Description("Finds a partitioning into k clusters.")
-@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992")
-public class KMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<V, D, Clustering<MeanModel<V>>> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
- /**
- * The logger for this class.
- */
- private static final Logging logger = Logging.getLogger(KMeans.class);
-
- /**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater than 0.
- */
- public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
-
- /**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater or equal to 0, where 0 means no limit.
- */
- public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
-
- /**
- * Parameter to specify the random generator seed.
- */
- public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
-
- /**
- * Holds the value of {@link #K_ID}.
- */
- private int k;
-
- /**
- * Holds the value of {@link #MAXITER_ID}.
- */
- private int maxiter;
-
- /**
- * Holds the value of {@link #SEED_ID}.
- */
- private Long seed;
-
- /**
- * Constructor.
- *
- * @param distanceFunction distance function
- * @param k k parameter
- * @param maxiter Maxiter parameter
- * @param seed Random generator seed
- */
- public KMeans(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, Long seed) {
- super(distanceFunction);
- this.k = k;
- this.maxiter = maxiter;
- this.seed = seed;
- }
-
- /**
- * Run k-means
- *
- * @param database Database
- * @param relation relation to use
- * @return result
- * @throws IllegalStateException
- */
- public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
- final Random random = (this.seed != null) ? new Random(this.seed) : new Random();
- if(relation.size() > 0) {
- final int dim = DatabaseUtil.dimensionality(relation);
- Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
- List<V> means = new ArrayList<V>(k);
- List<V> oldMeans;
- if(logger.isVerbose()) {
- logger.verbose("initializing random vectors");
- }
- for(int i = 0; i < k; i++) {
- double[] r = MathUtil.randomDoubleArray(dim, random);
- // Rescale
- for (int d = 0; d < dim; d++) {
- r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
- }
- // Instantiate
- V randomVector = minmax.first.newInstance(r);
- means.add(randomVector);
- }
- List<? extends ModifiableDBIDs> clusters;
- clusters = sort(means, relation);
- boolean changed = true;
- int iteration = 1;
- while(changed) {
- if(logger.isVerbose()) {
- logger.verbose("iteration " + iteration);
- }
- oldMeans = new ArrayList<V>(means);
- means = means(clusters, means, relation);
- clusters = sort(means, relation);
- changed = !means.equals(oldMeans);
- iteration++;
-
- if(maxiter > 0 && iteration > maxiter) {
- break;
- }
- }
- Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
- for(int i = 0; i < clusters.size(); i++) {
- DBIDs ids = clusters.get(i);
- MeanModel<V> model = new MeanModel<V>(means.get(i));
- result.addCluster(new Cluster<MeanModel<V>>(ids, model));
- }
- return result;
- }
- else {
- return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
- }
- }
-
- /**
- * Returns the mean vectors of the given clusters in the given database.
- *
- * @param clusters the clusters to compute the means
- * @param means the recent means
- * @param database the database containing the vectors
- * @return the mean vectors of the given clusters in the given database
- */
- protected List<V> means(List<? extends ModifiableDBIDs> clusters, List<V> means, Relation<V> database) {
- List<V> newMeans = new ArrayList<V>(k);
- for(int i = 0; i < k; i++) {
- ModifiableDBIDs list = clusters.get(i);
- V mean = null;
- for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) {
- if(mean == null) {
- mean = database.get(clusterIter.next());
- }
- else {
- mean = mean.plus(database.get(clusterIter.next()));
- }
- }
- if(list.size() > 0) {
- assert mean != null;
- mean = mean.multiplicate(1.0 / list.size());
- }
- else {
- mean = means.get(i);
- }
- newMeans.add(mean);
- }
- return newMeans;
- }
-
- /**
- * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
- * those FeatureVectors, that are nearest to the k<sup>th</sup> mean.
- *
- * @param means a list of k means
- * @param database the database to cluster
- * @return list of k clusters
- */
- protected List<? extends ModifiableDBIDs> sort(List<V> means, Relation<V> database) {
- List<ArrayModifiableDBIDs> clusters = new ArrayList<ArrayModifiableDBIDs>(k);
- for(int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newArray());
- }
-
- for(DBID id : database.iterDBIDs()) {
- List<D> distances = new ArrayList<D>(k);
- V fv = database.get(id);
- int minIndex = 0;
- for(int d = 0; d < k; d++) {
- distances.add(getDistanceFunction().distance(fv, means.get(d)));
- if(distances.get(d).compareTo(distances.get(minIndex)) < 0) {
- minIndex = d;
- }
- }
- clusters.get(minIndex).add(id);
- }
- for(ArrayModifiableDBIDs cluster : clusters) {
- Collections.sort(cluster);
- }
- return clusters;
- }
-
- @Override
- public TypeInformation[] getInputTypeRestriction() {
- return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
- }
-
- @Override
- protected Logging getLogger() {
- return logger;
- }
-
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
- protected int k;
-
- protected int maxiter;
-
- protected Long seed;
-
- @Override
- protected void makeOptions(Parameterization config) {
- super.makeOptions(config);
- IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
- if(config.grab(kP)) {
- k = kP.getValue();
- }
-
- IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
- if(config.grab(maxiterP)) {
- maxiter = maxiterP.getValue();
- }
-
- LongParameter seedP = new LongParameter(SEED_ID, true);
- if(config.grab(seedP)) {
- seed = seedP.getValue();
- }
- }
-
- @Override
- protected KMeans<V, D> makeInstance() {
- return new KMeans<V, D>(distanceFunction, k, maxiter, seed);
- }
- }
-} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
index 24985e24..2244b07b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java
index c233963d..d6c5872a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
index f7bd10c7..41e48b89 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java
index e1329888..45b12c43 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -133,18 +133,18 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
public Result run(Database database, Relation<O> relation) {
DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass();
- WritableRecordStore store = DataStoreUtil.makeRecordStorage(distQuery.getRelation().getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls);
+ WritableRecordStore store = DataStoreUtil.makeRecordStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls);
pi = store.getStorage(0, DBID.class);
lambda = store.getStorage(1, distCls);
// Temporary storage for m.
- WritableDataStore<D> m = DataStoreUtil.makeStorage(distQuery.getRelation().getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls);
+ WritableDataStore<D> m = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls);
- FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", distQuery.getRelation().size(), logger) : null;
+ FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", relation.size(), logger) : null;
// has to be an array for monotonicity reasons!
- ModifiableDBIDs processedIDs = DBIDUtil.newArray(distQuery.getRelation().size());
+ ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size());
// apply the algorithm
- for(DBID id : distQuery.getRelation().iterDBIDs()) {
+ for(DBID id : relation.iterDBIDs()) {
step1(id);
step2(id, processedIDs, distQuery, m);
step3(id, processedIDs, m);
@@ -168,8 +168,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
BasicResult result = null;
// Build clusters identified by their target object
- int minc = minclusters != null ? minclusters : distQuery.getRelation().size();
- result = extractClusters(distQuery.getRelation().getDBIDs(), pi, lambda, minc);
+ int minc = minclusters != null ? minclusters : relation.size();
+ result = extractClusters(relation.getDBIDs(), pi, lambda, minc);
result.addChildResult(new MaterializedRelation<DBID>("SLINK pi", "slink-order", TypeUtil.DBID, pi, processedIDs));
result.addChildResult(new MaterializedRelation<D>("SLINK lambda", "slink-order", new SimpleTypeInformation<D>(distCls), lambda, processedIDs));
@@ -288,7 +288,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
D stopdist = null;
// sort by lambda
ArrayModifiableDBIDs order = DBIDUtil.newArray(ids);
- Collections.sort(order, new CompareByLambda<D>(lambda));
+ order.sort(new CompareByLambda<D>(lambda));
int index = ids.size() - minclusters - 1;
while(index >= 0) {
if(lambda.get(order.get(index)).equals(lambda.get(order.get(index + 1)))) {
@@ -458,7 +458,7 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori
// extract a hierarchical clustering
ArrayModifiableDBIDs order = DBIDUtil.newArray(ids);
// sort by lambda
- Collections.sort(order, new CompareByLambda<D>(lambda));
+ order.sort(new CompareByLambda<D>(lambda));
D curdist = null;
D stopdist = null;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
index 3bde2932..7c3a13c9 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -25,7 +25,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering;
import java.util.ArrayList;
import java.util.Iterator;
-import java.util.LinkedList;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
@@ -36,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -200,8 +200,8 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
* @return the shared nearest neighbors of the specified query object in the
* given database
*/
- protected List<DBID> findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) {
- List<DBID> neighbors = new LinkedList<DBID>();
+ protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) {
+ ArrayModifiableDBIDs neighbors = DBIDUtil.newArray();
for(DBID id : snnInstance.getRelation().iterDBIDs()) {
if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) {
neighbors.add(id);
@@ -222,7 +222,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
* clustering
*/
protected void expandCluster(SimilarityQuery<O, IntegerDistance> snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
- List<DBID> seeds = findSNNNeighbors(snnInstance, startObjectID);
+ ArrayModifiableDBIDs seeds = findSNNNeighbors(snnInstance, startObjectID);
// startObject is no core-object
if(seeds.size() < minpts) {
@@ -247,11 +247,10 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
noise.remove(seed);
}
}
- seeds.remove(0);
while(seeds.size() > 0) {
- DBID o = seeds.remove(0);
- List<DBID> neighborhood = findSNNNeighbors(snnInstance, o);
+ DBID o = seeds.remove(seeds.size() - 1);
+ ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o);
if(neighborhood.size() >= minpts) {
for(DBID p : neighborhood) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
index 4a0b391c..b877415e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.datasource.filter.NonNumericFeaturesException;
+import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -84,7 +84,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
* Provides the CASH algorithm, an subspace clustering algorithm based on the
- * hough transform.
+ * Hough transform.
+ *
+ * <b>Note:</b> CASH requires explicitly setting the input parser other than default to
+ * {@link de.lmu.ifi.dbs.elki.datasource.parser.ParameterizationFunctionLabelParser}:
+ * (in the MiniGui, set option: dbc.parser ParameterizationFunctionLabelParser).
+ *
* <p>
* Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek: Robust
* clustering in arbitrarily oriented subspaces. <br>
@@ -99,7 +104,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
*/
// todo elke hierarchy (later)
@Title("CASH: Robust clustering in arbitrarily oriented subspaces")
-@Description("Subspace clustering algorithm based on the hough transform.")
+@Description("Subspace clustering algorithm based on the Hough transform.")
@Reference(authors = "E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek", title = "Robust clustering in arbitraily oriented subspaces", booktitle = "Proc. 8th SIAM Int. Conf. on Data Mining (SDM'08), Atlanta, GA, 2008", url = "http://www.siam.org/proceedings/datamining/2008/dm08_69_AchtertBoehmDavidKroegerZimek.pdf")
public class CASH extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
/**
@@ -349,7 +354,7 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
res.addCluster(c);
noiseIDs.removeDBIDs(interval.getIDs());
clusterIDs.addDBIDs(interval.getIDs());
- processedIDs.addAll(interval.getIDs());
+ processedIDs.addDBIDs(interval.getIDs());
}
// Rebuild heap
@@ -372,13 +377,13 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
if(dim == noiseDim) {
Cluster<Model> c = new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER);
res.addCluster(c);
- processedIDs.addAll(noiseIDs);
+ processedIDs.addDBIDs(noiseIDs);
}
else if(noiseIDs.size() >= minPts) {
LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
Cluster<Model> c = new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les));
res.addCluster(c);
- processedIDs.addAll(noiseIDs);
+ processedIDs.addDBIDs(noiseIDs);
}
}
@@ -521,7 +526,7 @@ public class CASH extends AbstractAlgorithm<Clustering<Model>> implements Cluste
private ParameterizationFunction project(Matrix basis, ParameterizationFunction f) {
// Matrix m = new Matrix(new
// double[][]{f.getPointCoordinates()}).times(basis);
- Matrix m = f.getRowVector().times(basis);
+ Matrix m = f.getColumnVector().transposeTimes(basis);
ParameterizationFunction f_t = new ParameterizationFunction(m.getColumnPackedCopy());
return f_t;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
index 8fc30b3d..575bf117 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
index 75633853..af4f677f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -244,7 +244,7 @@ public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Cluste
}
else {
ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
- merged.addAll(clus.getIDs().asCollection());
+ merged.addDBIDs(clus.getIDs());
noise.setIDs(merged);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java
index 93d0cc99..98761962 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
index 92723428..1065682c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
new file mode 100644
index 00000000..41ee1f69
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
@@ -0,0 +1,566 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.Model;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.math.histograms.FlexiHistogram;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair;
+
+/**
+ * Linear manifold clustering in high dimensional spaces by stochastic search.
+ *
+ * Reference:
+ * <p>
+ * Robert Haralick, Rave Harpaz<br />
+ * Linear manifold clustering in high dimensional spaces by stochastic search<br/>
+ * In: Pattern Recognition volume 40, Issue 10
+ * </p>
+ *
+ * Implementation note: the LMCLUS algorithm seems to lack good stopping
+ * criterions. We can't entirely reproduce the good results from the original
+ * publication, in particular not on noisy data. But the questionable parts are
+ * as in the original publication, associated thesis and published source code.
+ * The minimum cluster size however can serve as a hidden stopping criterion.
+ *
+ * @author Ernst Waas
+ * @author Erich Schubert
+ */
+@Reference(authors = "Robert Haralick, Rave Harpaz", title = "Linear manifold clustering in high dimensional spaces by stochastic search", booktitle = "Pattern Recognition volume 40, Issue 10", url = "http://dx.doi.org/10.1016/j.patcog.2007.01.020")
+public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(LMCLUS.class);
+
+ /**
+ * Epsilon
+ */
+ private final static double NOT_FROM_ONE_CLUSTER_PROBABILITY = 0.2;
+
+ /**
+ * Histogram resolution
+ */
+ private final static int BINS = 50;
+
+ /**
+ * The current threshold value calculated by the findSeperation Method.
+ */
+ private final double sensitivityThreshold;
+
+ /**
+ * Maximum cluster dimensionality
+ */
+ private final int maxLMDim;
+
+ /**
+ * Minimum cluster size
+ */
+ private final int minsize;
+
+ /**
+ * Number of sampling rounds to find a good split
+ */
+ private final int samplingLevel;
+
+ /**
+ * Constructor.
+ *
+ * @param maxdim Maximum dimensionality
+ * @param minsize Minimum cluster size
+ * @param samplingLevel Sampling level
+ * @param sensitivityThreshold Threshold
+ */
+ public LMCLUS(int maxdim, int minsize, int samplingLevel, double sensitivityThreshold) {
+ super();
+ this.maxLMDim = maxdim;
+ this.minsize = minsize;
+ this.samplingLevel = samplingLevel;
+ this.sensitivityThreshold = sensitivityThreshold;
+ }
+
+ /**
+ * The main LMCLUS (Linear manifold clustering algorithm) is processed in this
+ * method.
+ *
+ * <PRE>
+ * The algorithm samples random linear manifolds and tries to find clusters in it.
+ * It calculates a distance histogram searches for a threshold and partitions the
+ * points in two groups the ones in the cluster and everything else.
+ * Then the best fitting linear manifold is searched and registered as a cluster.
+ * The process is started over until all points are clustered.
+ * The last cluster should contain all the outliers. (or the whole data if no clusters have been found.)
+ * For details see {@link LMCLUS}.
+ * </PRE>
+ *
+ * @param database The database to operate on
+ * @param relation Relation
+ * @return Clustering result
+ * @throws de.lmu.ifi.dbs.elki.utilities.UnableToComplyException
+ */
+ public Clustering<Model> run(Database database, Relation<NumberVector<?, ?>> relation) throws UnableToComplyException {
+ Clustering<Model> ret = new Clustering<Model>("LMCLUS Clustering", "lmclus-clustering");
+ FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), logger) : null;
+ IndefiniteProgress cprogress = logger.isVerbose() ? new IndefiniteProgress("Clusters found", logger) : null;
+ ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
+
+ final int maxdim = Math.min(maxLMDim, DatabaseUtil.dimensionality(relation));
+ int cnum = 0;
+ while(unclustered.size() > minsize) {
+ DBIDs current = unclustered;
+ int lmDim = 1;
+ for(int k = 1; k <= maxdim; k++) {
+ // Implementation note: this while loop is from the original publication
+ // and the published LMCLUS source code. It doesn't make sense to me -
+ // it is lacking a stop criterion other than "cluster is too small" and
+ // "cluster is inseparable"! Additionally, there is good criterion for
+ // stopping at the appropriate dimensionality either.
+ while(true) {
+ Separation separation = findSeparation(relation, current, k);
+ // logger.verbose("k: " + k + " goodness: " + separation.goodness +
+ // " threshold: " + separation.threshold);
+ if(separation.goodness <= sensitivityThreshold) {
+ break;
+ }
+ ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
+ for(DBID id : current) {
+ if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
+ subset.add(id);
+ }
+ }
+ // logger.verbose("size:"+subset.size());
+ if(subset.size() < minsize) {
+ break;
+ }
+ current = subset;
+ lmDim = k;
+ // System.out.println("Partition: " + subset.size());
+ }
+ }
+ // No more clusters found
+ if(current.size() < minsize || current == unclustered) {
+ break;
+ }
+ // New cluster found
+ // TODO: annotate cluster with dimensionality
+ final Cluster<Model> cluster = new Cluster<Model>(current);
+ cluster.setName("Cluster_" + lmDim + "d_" + cnum);
+ cnum++;
+ ret.addCluster(cluster);
+ // Remove from main working set.
+ unclustered.removeDBIDs(current);
+ if(progress != null) {
+ progress.setProcessed(relation.size() - unclustered.size(), logger);
+ }
+ if(cprogress != null) {
+ cprogress.setProcessed(cnum, logger);
+ }
+ }
+ // Remaining objects are noise
+ if(unclustered.size() > 0) {
+ ret.addCluster(new Cluster<Model>(unclustered, true));
+ }
+ if(progress != null) {
+ progress.setProcessed(relation.size(), logger);
+ progress.ensureCompleted(logger);
+ }
+ if(cprogress != null) {
+ cprogress.setCompleted(logger);
+ }
+ return ret;
+ }
+
+ /**
+ * Deviation from a manifold described by beta.
+ *
+ * @param delta Delta from origin vector
+ * @param beta Manifold
+ * @return Deviation score
+ */
+ private double deviation(Vector delta, Matrix beta) {
+ double a = delta.euclideanLength();
+ double b = beta.transposeTimes(delta).euclideanLength();
+ return Math.sqrt((a * a) - (b * b));
+ }
+
+ /**
+ * This method samples a number of linear manifolds an tries to determine
+ * which the one with the best cluster is.
+ *
+ * <PRE>
+ * A number of sample points according to the dimension of the linear manifold are taken.
+ * The basis (B) and the origin(o) of the manifold are calculated.
+ * A distance histogram using the distance function ||x-o|| -||B^t*(x-o)|| is generated.
+ * The best threshold is searched using the elevate threshold function.
+ * The overall goodness of the threshold is determined.
+ * The process is redone until a specific number of samples is taken.
+ * </PRE>
+ *
+ * @param relation The vector relation
+ * @param currentids Current DBIDs
+ * @param dimension the dimension of the linear manifold to sample.
+ * @return the overall goodness of the separation. The values origin basis and
+ * threshold are returned indirectly over class variables.
+ */
+ private Separation findSeparation(Relation<NumberVector<?, ?>> relation, DBIDs currentids, int dimension) {
+ Separation separation = new Separation();
+ // determine the number of samples needed, to secure that with a specific
+ // probability
+ // in at least on sample every sampled point is from the same cluster.
+ int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size());
+ // System.out.println("Number of samples: " + samples);
+ Random r = new Random();
+ int remaining_retries = 100;
+ for(int i = 1; i <= samples; i++) {
+ DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong());
+ final Iterator<DBID> iter = sample.iterator();
+ // Use first as origin
+ DBID origin = iter.next();
+ Vector originV = relation.get(origin).getColumnVector();
+ // Build orthogonal basis from remainder
+ Matrix basis;
+ {
+ List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1);
+ while(iter.hasNext()) {
+ Vector vec = relation.get(iter.next()).getColumnVector();
+ vectors.add(vec.minusEquals(originV));
+ }
+ // generate orthogonal basis
+ basis = generateOrthonormalBasis(vectors);
+ if(basis == null) {
+ // new sample has to be taken.
+ i--;
+ remaining_retries--;
+ if(remaining_retries < 0) {
+ throw new AbortException("Too many retries in sampling, and always a linear dependant data set.");
+ }
+ continue;
+ }
+ }
+ // Generate and fill a histogram.
+ FlexiHistogram<Double, Double> histogram = FlexiHistogram.DoubleSumHistogram(BINS);
+ double w = 1.0 / currentids.size();
+ for(DBID point : currentids) {
+ // Skip sampled points
+ if(sample.contains(point)) {
+ continue;
+ }
+ Vector vec = relation.get(point).getColumnVector().minusEquals(originV);
+ final double distance = deviation(vec, basis);
+ histogram.aggregate(distance, w);
+ }
+ double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold
+ if(th[1] > separation.goodness) {
+ separation.goodness = th[1];
+ separation.threshold = th[0];
+ separation.originV = originV;
+ separation.basis = basis;
+ }
+ }
+ return separation;
+ }
+
+ /**
+ * This Method generates an orthonormal basis from a set of Vectors. It uses
+ * the established Gram-Schmidt algorithm for orthonormalisation:
+ *
+ * <PRE>
+ * u_1 = v_1
+ * u_k = v_k -proj_u1(v_k)...proj_u(k-1)(v_k);
+ *
+ * Where proj_u(v) = <v,u>/<u,u> *u
+ * </PRE>
+ *
+ * @param vectors The set of vectors to generate the orthonormal basis from
+ * @return the orthonormal basis generated by this method.
+ * @throws RuntimeException if the given vectors are not linear independent.
+ */
+ private Matrix generateOrthonormalBasis(List<Vector> vectors) {
+ Vector first = vectors.get(0);
+ first = first.times(1.0 / first.euclideanLength());
+ Matrix ret = new Matrix(first.getDimensionality(), vectors.size());
+ ret.setCol(0, first);
+ for(int i = 1; i < vectors.size(); i++) {
+ // System.out.println("Matrix:" + ret);
+ Vector v_i = vectors.get(i);
+ Vector u_i = v_i.copy();
+ // System.out.println("Vector " + i + ":" + partialSol);
+ for(int j = 0; j < i; j++) {
+ Vector v_j = ret.getCol(j);
+ double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j);
+ if(Double.isNaN(f)) {
+ if(logger.isDebuggingFine()) {
+ logger.debugFine("Zero vector encountered? " + v_j);
+ }
+ return null;
+ }
+ u_i.minusTimesEquals(v_j, f);
+ }
+ // check if the vectors weren't independent
+ final double len_u_i = u_i.euclideanLength();
+ if(len_u_i == 0.0) {
+ if(logger.isDebuggingFine()) {
+ logger.debugFine("Points not independent - no orthonormalization.");
+ }
+ return null;
+ }
+ // System.out.println("Vector " + i + ":" + partialSol);
+ u_i.timesEquals(1 / len_u_i);
+ ret.setCol(i, u_i);
+ }
+ return ret;
+ }
+
+ /**
+ * Evaluate the histogram to find a suitable threshold
+ *
+ * @param histogram Histogram to evaluate
+ * @return Position and goodness
+ */
+ private double[] findAndEvaluateThreshold(FlexiHistogram<Double, Double> histogram) {
+ int n = histogram.getNumBins();
+ double[] p1 = new double[n];
+ double[] p2 = new double[n];
+ double[] mu1 = new double[n];
+ double[] mu2 = new double[n];
+ double[] sigma1 = new double[n];
+ double[] sigma2 = new double[n];
+ double[] jt = new double[n];
+ // Forward pass
+ {
+ MeanVariance mv = new MeanVariance();
+ Iterator<DoubleObjPair<Double>> forward = histogram.iterator();
+ for(int i = 0; forward.hasNext(); i++) {
+ DoubleObjPair<Double> pair = forward.next();
+ p1[i] = pair.second + ((i > 0) ? p1[i - 1] : 0);
+ mv.put(i, pair.second);
+ mu1[i] = mv.getMean();
+ sigma1[i] = mv.getNaiveStddev();
+ }
+ }
+ // Backwards pass
+ {
+ MeanVariance mv = new MeanVariance();
+ Iterator<DoubleObjPair<Double>> backwards = histogram.reverseIterator();
+ for(int j = n - 1; backwards.hasNext(); j--) {
+ DoubleObjPair<Double> pair = backwards.next();
+ p2[j] = pair.second + ((j + 1 < n) ? p2[j + 1] : 0);
+ mv.put(j, pair.second);
+ mu2[j] = mv.getMean();
+ sigma2[j] = mv.getNaiveStddev();
+ }
+ }
+
+ for(int i = 0; i < n; i++) {
+ jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i])));
+ }
+
+ int bestpos = -1;
+ double bestgoodness = Double.NEGATIVE_INFINITY;
+
+ double devPrev = jt[1] - jt[0];
+ for(int i = 1; i < jt.length - 1; i++) {
+ double devCur = jt[i + 1] - jt[i];
+ // System.out.println(p1[i]);
+ // System.out.println(jt[i + 1]);
+ // System.out.println(jt[i]);
+ // System.out.println(devCur);
+ // Local minimum found - calculate depth
+ if(devCur >= 0 && devPrev <= 0) {
+ double lowestMaxima = Double.POSITIVE_INFINITY;
+ for(int j = i - 1; j > 0; j--) {
+ if(jt[j - 1] < jt[j]) {
+ lowestMaxima = Math.min(lowestMaxima, jt[j]);
+ break;
+ }
+ }
+ for(int j = i + 1; j < n - 2; j++) {
+ if(jt[j + 1] < jt[j]) {
+ lowestMaxima = Math.min(lowestMaxima, jt[j]);
+ break;
+ }
+ }
+ double localDepth = lowestMaxima - jt[i];
+
+ final double mud = mu1[i] - mu2[i];
+ double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]);
+ if(Double.isNaN(discriminability)) {
+ discriminability = -1;
+ }
+ double goodness = localDepth * discriminability;
+ if(goodness > bestgoodness) {
+ bestgoodness = goodness;
+ bestpos = i;
+ }
+ }
+ devPrev = devCur;
+ }
+ return new double[] { histogram.getBinMax(bestpos), bestgoodness };
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ /**
+ * Class to represent a linear manifold separation
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class Separation {
+ /**
+ * Goodness of separation
+ */
+ double goodness = Double.NEGATIVE_INFINITY;
+
+ /**
+ * Threshold
+ */
+ double threshold = Double.NEGATIVE_INFINITY;
+
+ /**
+ * Basis of manifold
+ */
+ Matrix basis = null;
+
+ /**
+ * Origin vector
+ */
+ Vector originV = null;
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Parameter with the maximum dimension to search for
+ */
+ public static final OptionID MAXDIM_ID = OptionID.getOrCreateOptionID("lmclus.maxdim", "Maximum linear manifold dimension to search.");
+
+ /**
+ * Parameter for the minimum cluster size
+ */
+ public static final OptionID MINSIZE_ID = OptionID.getOrCreateOptionID("lmclus.minsize", "Minimum cluster size to allow.");
+
+ /**
+ * Sampling intensity level
+ */
+ public static final OptionID SAMPLINGL_ID = OptionID.getOrCreateOptionID("lmclus.sampling-level", "A number used to determine how many samples are taken in each search.");
+
+ /**
+ * Global significance threshold
+ */
+ public static final OptionID THRESHOLD_ID = OptionID.getOrCreateOptionID("lmclus.threshold", "Threshold to determine if a cluster was found.");
+
+ /**
+ * Maximum dimensionality to search for
+ */
+ private int maxdim = Integer.MAX_VALUE;
+
+ /**
+ * Minimum cluster size.
+ */
+ private int minsize;
+
+ /**
+ * Sampling level
+ */
+ private int samplingLevel;
+
+ /**
+ * Threshold
+ */
+ private double threshold;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter maxLMDimP = new IntParameter(MAXDIM_ID, new GreaterEqualConstraint(1), true);
+ if(config.grab(maxLMDimP)) {
+ maxdim = maxLMDimP.getValue();
+ }
+ IntParameter minsizeP = new IntParameter(MINSIZE_ID, new GreaterEqualConstraint(1));
+ if(config.grab(minsizeP)) {
+ minsize = minsizeP.getValue();
+ }
+ IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100);
+ if(config.grab(samplingLevelP)) {
+ samplingLevel = samplingLevelP.getValue();
+ }
+ DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID);
+ if(config.grab(sensivityThresholdP)) {
+ threshold = sensivityThresholdP.getValue();
+ }
+ }
+
+ @Override
+ protected LMCLUS makeInstance() {
+ return new LMCLUS(maxdim, minsize, samplingLevel, threshold);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
index 924e1786..eb5608fc 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -50,6 +50,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.SortedEigenPairs;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAResult;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCARunner;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
@@ -420,14 +421,14 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
/**
* Returns the union of the two specified clusters.
*
- * @param database the database holding the objects
+ * @param relation the database holding the objects
* @param distFunc the distance function
* @param c1 the first cluster
* @param c2 the second cluster
* @param dim the dimensionality of the union cluster
* @return the union of the two specified clusters
*/
- private ORCLUSCluster union(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster c1, ORCLUSCluster c2, int dim) {
+ private ORCLUSCluster union(Relation<V> relation, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster c1, ORCLUSCluster c2, int dim) {
ORCLUSCluster c = new ORCLUSCluster();
c.objectIDs = DBIDUtil.newHashSet(c1.objectIDs);
@@ -436,11 +437,13 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
c.objectIDs = DBIDUtil.newArray(c.objectIDs);
if(c.objectIDs.size() > 0) {
- c.centroid = DatabaseUtil.centroid(database, c.objectIDs);
- c.basis = findBasis(database, distFunc, c, dim);
+ c.centroid = DatabaseUtil.centroid(relation, c.objectIDs);
+ c.basis = findBasis(relation, distFunc, c, dim);
}
else {
- c.centroid = c1.centroid.plus(c2.centroid).multiplicate(0.5);
+ V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5);
+ c.centroid = factory.newNumberVector(cent.getArrayRef());
double[][] doubles = new double[c1.basis.getRowDimensionality()][dim];
for(int i = 0; i < dim; i++) {
doubles[i][i] = 1;
@@ -460,9 +463,9 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
* @return the projection of double vector o in the subspace of cluster c
*/
private V projection(ORCLUSCluster c, V o, V factory) {
- Matrix o_proj = o.getRowVector().times(c.basis);
+ Matrix o_proj = o.getColumnVector().transposeTimes(c.basis);
double[] values = o_proj.getColumnPackedCopy();
- return factory.newInstance(values);
+ return factory.newNumberVector(values);
}
@Override
@@ -523,7 +526,7 @@ public class ORCLUS<V extends NumberVector<V, ?>> extends AbstractProjectedClust
for(int d = 1; d <= o.getDimensionality(); d++) {
values[d - 1] = o.doubleValue(d);
}
- this.centroid = factory.newInstance(values);
+ this.centroid = factory.newNumberVector(values);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java
index 6c5db740..46112498 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java
index 62ff658f..86e045cb 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java
index 8cd156e8..8b6d104c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java
@@ -7,7 +7,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java
index 82a1f1e1..665de632 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java
@@ -7,7 +7,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
new file mode 100644
index 00000000..d3c73b53
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -0,0 +1,310 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Abstract base class for k-means implementations.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+public abstract class AbstractKMeans<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?, ?>, D, Clustering<MeanModel<V>>> {
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = OptionID.getOrCreateOptionID("kmeans.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater or equal to 0, where 0 means no limit.
+ */
+ public static final OptionID MAXITER_ID = OptionID.getOrCreateOptionID("kmeans.maxiter", "The maximum number of iterations to do. 0 means no limit.");
+
+ /**
+ * Parameter to specify the random generator seed.
+ */
+ public static final OptionID SEED_ID = OptionID.getOrCreateOptionID("kmeans.seed", "The random number generator seed.");
+
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = OptionID.getOrCreateOptionID("kmeans.initialization", "Method to choose the initial means.");
+
+ /**
+ * Holds the value of {@link #K_ID}.
+ */
+ protected int k;
+
+ /**
+ * Holds the value of {@link #MAXITER_ID}.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMeansInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public AbstractKMeans(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
+ * those FeatureVectors, that are nearest to the k<sup>th</sup> mean.
+ *
+ * @param relation the database to cluster
+ * @param means a list of k means
+ * @param clusters cluster assignment
+ * @return true when the object was reassigned
+ */
+ protected boolean assignToNearestCluster(Relation<V> relation, List<Vector> means, List<? extends ModifiableDBIDs> clusters) {
+ boolean changed = false;
+
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
+ for(DBID id : relation.iterDBIDs()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.doubleDistance(fv, means.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ if(clusters.get(minIndex).add(id)) {
+ changed = true;
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(id)) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ else {
+ final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
+ for(DBID id : relation.iterDBIDs()) {
+ D mindist = df.getDistanceFactory().infiniteDistance();
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ D dist = df.distance(fv, means.get(i));
+ if(dist.compareTo(mindist) < 0) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ if(clusters.get(minIndex).add(id)) {
+ changed = true;
+ // Remove from previous cluster
+ // TODO: keep a list of cluster assignments to save this search?
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(id)) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return changed;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ /**
+ * Returns the mean vectors of the given clusters in the given database.
+ *
+ * @param clusters the clusters to compute the means
+ * @param means the recent means
+ * @param database the database containing the vectors
+ * @return the mean vectors of the given clusters in the given database
+ */
+ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<Vector> means, Relation<V> database) {
+ List<Vector> newMeans = new ArrayList<Vector>(k);
+ for(int i = 0; i < k; i++) {
+ ModifiableDBIDs list = clusters.get(i);
+ Vector mean = null;
+ for(Iterator<DBID> clusterIter = list.iterator(); clusterIter.hasNext();) {
+ if(mean == null) {
+ mean = database.get(clusterIter.next()).getColumnVector();
+ }
+ else {
+ mean.plusEquals(database.get(clusterIter.next()).getColumnVector());
+ }
+ }
+ if(list.size() > 0) {
+ assert mean != null;
+ mean.timesEquals(1.0 / list.size());
+ }
+ else {
+ mean = means.get(i);
+ }
+ newMeans.add(mean);
+ }
+ return newMeans;
+ }
+
+ /**
+ * Compute an incremental update for the mean
+ *
+ * @param mean Mean to update
+ * @param vec Object vector
+ * @param newsize (New) size of cluster
+ * @param op Cluster size change / Weight change
+ */
+ protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) {
+ if(newsize == 0) {
+ return; // Keep old mean
+ }
+ Vector delta = vec.getColumnVector();
+ // Compute difference from mean
+ delta.minusEquals(mean);
+ delta.timesEquals(op / newsize);
+ mean.plusEquals(delta);
+ }
+
+ /**
+ * Perform a MacQueen style iteration.
+ *
+ * @param relation Relation
+ * @param means Means
+ * @param clusters Clusters
+ * @return true when the means have changed
+ */
+ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
+ boolean changed = false;
+
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ // Raw distance function
+ @SuppressWarnings("unchecked")
+ final PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?, ?>>) getDistanceFunction();
+
+ // Incremental update
+ for(DBID id : relation.iterDBIDs()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ double dist = df.doubleDistance(fv, means.get(i));
+ if(dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ // Update the cluster mean incrementally:
+ for(int i = 0; i < k; i++) {
+ ModifiableDBIDs ci = clusters.get(i);
+ if(i == minIndex) {
+ if(ci.add(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ changed = true;
+ }
+ }
+ else if(ci.remove(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ changed = true;
+ }
+ }
+ }
+ }
+ else {
+ // Raw distance function
+ final PrimitiveDistanceFunction<? super NumberVector<?, ?>, D> df = getDistanceFunction();
+
+ // Incremental update
+ for(DBID id : relation.iterDBIDs()) {
+ D mindist = df.getDistanceFactory().infiniteDistance();
+ V fv = relation.get(id);
+ int minIndex = 0;
+ for(int i = 0; i < k; i++) {
+ D dist = df.distance(fv, means.get(i));
+ if(dist.compareTo(mindist) < 0) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ // Update the cluster mean incrementally:
+ for(int i = 0; i < k; i++) {
+ ModifiableDBIDs ci = clusters.get(i);
+ if(i == minIndex) {
+ if(ci.add(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size(), +1);
+ changed = true;
+ }
+ }
+ else if(ci.remove(id)) {
+ incrementalUpdateMean(means.get(i), relation.get(id), ci.size() + 1, -1);
+ changed = true;
+ }
+ }
+ }
+ }
+ return changed;
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
new file mode 100644
index 00000000..b5f088fb
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java
@@ -0,0 +1,71 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.LongParameter;
+
+/**
+ * Abstract base class for common k-means initializations.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public abstract class AbstractKMeansInitialization<V extends NumberVector<V, ?>> implements KMeansInitialization<V> {
+ /**
+ * Holds the value of {@link KMeansLloyd#SEED_ID}.
+ */
+ protected Long seed;
+
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public AbstractKMeansInitialization(Long seed) {
+ this.seed = seed;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public abstract static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ protected Long seed;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ LongParameter seedP = new LongParameter(AbstractKMeans.SEED_ID, true);
+ if(config.grab(seedP)) {
+ seed = seedP.getValue();
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
new file mode 100644
index 00000000..78ccd426
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java
@@ -0,0 +1,74 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Initialize K-means by using the first k objects as initial means.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class FirstKInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ */
+ public FirstKInitialMeans() {
+ super(null);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ Iterator<DBID> iter = relation.iterDBIDs();
+ List<Vector> means = new ArrayList<Vector>(k);
+ for(int i = 0; i < k && iter.hasNext(); i++) {
+ means.add(relation.get(iter.next()).getColumnVector());
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
+ @Override
+ protected FirstKInitialMeans<V> makeInstance() {
+ return new FirstKInitialMeans<V>();
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
new file mode 100644
index 00000000..f4c0d9c7
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java
@@ -0,0 +1,49 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Interface for initializing K-Means
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public interface KMeansInitialization<V extends NumberVector<V, ?>> {
+ /**
+ * Choose initial means
+ *
+ * @param relation Relation
+ * @param k Parameter k
+ * @param distanceFunction Distance function
+ * @return List of chosen means for k-means
+ */
+ public abstract List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction);
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
new file mode 100644
index 00000000..fda1d6c0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -0,0 +1,176 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-means algorithm, using Lloyd-style bulk iterations.
+ *
+ * <p>
+ * Reference:<br />
+ * S. Lloyd<br/>
+ * Least squares quantization in PCM<br/>
+ * IEEE Transactions on Information Theory 28 (2)<br/>
+ * previously published as Bell Telephone Laboratories Paper
+ * </p>
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+@Title("K-Means")
+@Description("Finds a partitioning into k clusters.")
+@Reference(authors = "S. Lloyd", title = "Least squares quantization in PCM", booktitle = "IEEE Transactions on Information Theory 28 (2): 129–137.", url = "http://dx.doi.org/10.1109/TIT.1982.1056489")
+public class KMeansLloyd<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMeansLloyd.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public KMeansLloyd(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Run k-means
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ * @throws IllegalStateException
+ */
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ if(relation.size() <= 0) {
+ return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if(logger.isVerbose()) {
+ logger.verbose("K-Means iteration " + (iteration + 1));
+ }
+ boolean changed = assignToNearestCluster(relation, means, clusters);
+ // Stop if no cluster assignment changed.
+ if(!changed) {
+ break;
+ }
+ // Recompute means.
+ means = means(clusters, means, relation);
+ }
+ // Wrap result
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
+ result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected AbstractKMeans<V, D> makeInstance() {
+ return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
new file mode 100644
index 00000000..56492dd0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -0,0 +1,177 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.MeanModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Provides the k-means algorithm, using MacQueen style incremental updates.
+ *
+ * <p>
+ * Reference:<br />
+ * J. MacQueen: Some Methods for Classification and Analysis of Multivariate
+ * Observations. <br />
+ * In 5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297.
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MeanModel
+ *
+ * @param <V> vector type to use
+ * @param <D> distance function value type
+ */
+@Title("K-Means")
+@Description("Finds a partitioning into k clusters.")
+@Reference(authors = "J. MacQueen", title = "Some Methods for Classification and Analysis of Multivariate Observations", booktitle = "5th Berkeley Symp. Math. Statist. Prob., Vol. 1, 1967, pp 281-297", url = "http://projecteuclid.org/euclid.bsmsp/1200512992")
+public class KMeansMacQueen<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractKMeans<V, D> implements ClusteringAlgorithm<Clustering<MeanModel<V>>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(KMeansMacQueen.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ */
+ public KMeansMacQueen(PrimitiveDistanceFunction<NumberVector<?, ?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ /**
+ * Run k-means
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ * @throws IllegalStateException
+ */
+ public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
+ if(relation.size() <= 0) {
+ return new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ List<Vector> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction());
+ // Initialize cluster and assign objects
+ List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>();
+ for(int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ }
+ assignToNearestCluster(relation, means, clusters);
+ // Initial recomputation of the means.
+ means = means(clusters, means, relation);
+
+ // Refine result
+ for(int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if(logger.isVerbose()) {
+ logger.verbose("K-Means iteration " + (iteration + 1));
+ }
+ boolean changed = macQueenIterate(relation, means, clusters);
+ if(!changed) {
+ break;
+ }
+ }
+ final V factory = DatabaseUtil.assumeVectorField(relation).getFactory();
+ Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Means Clustering", "kmeans-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ DBIDs ids = clusters.get(i);
+ MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(means.get(i).getArrayRef()));
+ result.addCluster(new Cluster<MeanModel<V>>(ids, model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?, ?>, D> {
+ protected int k;
+
+ protected int maxiter;
+
+ protected KMeansInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID, new GreaterConstraint(0));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+
+ ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(MAXITER_ID, new GreaterEqualConstraint(0), 0);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.getValue();
+ }
+ }
+
+ @Override
+ protected AbstractKMeans<V, D> makeInstance() {
+ return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
new file mode 100644
index 00000000..c7a2fa1d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -0,0 +1,213 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.LoggingUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+
+/**
+ * K-Means++ initialization for k-means.
+ *
+ * Reference:
+ * <p>
+ * D. Arthur, S. Vassilvitskii<br />
+ * k-means++: the advantages of careful seeding<br />
+ * In: Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms,
+ * SODA 2007
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ * @param <D> Distance type
+ */
+@Reference(authors = "D. Arthur, S. Vassilvitskii", title = "k-means++: the advantages of careful seeding", booktitle = "Proc. of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2007", url = "http://dx.doi.org/10.1145/1283383.1283494")
+public class KMeansPlusPlusInitialMeans<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public KMeansPlusPlusInitialMeans(Long seed) {
+ super(seed);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ // Get a distance query
+ if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ throw new AbortException("K-Means++ initialization can only be used with numerical distances.");
+ }
+ @SuppressWarnings("unchecked")
+ final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction;
+ DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF);
+
+ // Chose first mean
+ List<Vector> means = new ArrayList<Vector>(k);
+
+ Random random = (seed != null) ? new Random(seed) : new Random();
+ DBID first = DBIDUtil.randomSample(relation.getDBIDs(), 1, random.nextLong()).iterator().next();
+ means.add(relation.get(first).getColumnVector());
+
+ ModifiableDBIDs chosen = DBIDUtil.newHashSet(k);
+ chosen.add(first);
+ ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
+ // Initialize weights
+ double[] weights = new double[ids.size()];
+ double weightsum = initialWeights(weights, ids, first, distQ);
+ while(means.size() < k) {
+ if(weightsum > Double.MAX_VALUE) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - too many data points, too large squared distances?");
+ }
+ if(weightsum < Double.MIN_NORMAL) {
+ LoggingUtil.warning("Could not choose a reasonable mean for k-means++ - to few data points?");
+ }
+ double r = random.nextDouble() * weightsum;
+ int pos = 0;
+ while(r > 0 && pos < weights.length) {
+ r -= weights[pos];
+ pos++;
+ }
+ // Add new mean:
+ DBID newmean = ids.get(pos);
+ means.add(relation.get(newmean).getColumnVector());
+ chosen.add(newmean);
+ // Update weights:
+ weights[pos] = 0.0;
+ // Choose optimized version for double distances, if applicable.
+ if (distF instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ PrimitiveDoubleDistanceFunction<V> ddist = (PrimitiveDoubleDistanceFunction<V>) distF;
+ weightsum = updateWeights(weights, ids, newmean, ddist, relation);
+ } else {
+ weightsum = updateWeights(weights, ids, newmean, distQ);
+ }
+ }
+
+ return means;
+ }
+
+ /**
+ * Initialize the weight list.
+ *
+ * @param weights Weight list
+ * @param ids IDs
+ * @param latest Added ID
+ * @param distQ Distance query
+ * @return Weight sum
+ */
+ protected double initialWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ double weightsum = 0.0;
+ DBIDIter it = ids.iter();
+ for(int i = 0; i < weights.length; i++, it.advance()) {
+ DBID id = it.getDBID();
+ if(latest.equals(id)) {
+ weights[i] = 0.0;
+ }
+ else {
+ double d = distQ.distance(latest, id).doubleValue();
+ weights[i] = d * d;
+ }
+ weightsum += weights[i];
+ }
+ return weightsum;
+ }
+
+ /**
+ * Update the weight list.
+ *
+ * @param weights Weight list
+ * @param ids IDs
+ * @param latest Added ID
+ * @param distQ Distance query
+ * @return Weight sum
+ */
+ protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, DistanceQuery<V, D> distQ) {
+ double weightsum = 0.0;
+ DBIDIter it = ids.iter();
+ for(int i = 0; i < weights.length; i++, it.advance()) {
+ DBID id = it.getDBID();
+ if(weights[i] > 0.0) {
+ double d = distQ.distance(latest, id).doubleValue();
+ weights[i] = Math.min(weights[i], d * d);
+ weightsum += weights[i];
+ }
+ }
+ return weightsum;
+ }
+
+ /**
+ * Update the weight list.
+ *
+ * @param weights Weight list
+ * @param ids IDs
+ * @param latest Added ID
+ * @param distF Distance function
+ * @return Weight sum
+ */
+ protected double updateWeights(double[] weights, ArrayDBIDs ids, DBID latest, PrimitiveDoubleDistanceFunction<V> distF, Relation<V> rel) {
+ final V lv = rel.get(latest);
+ double weightsum = 0.0;
+ DBIDIter it = ids.iter();
+ for(int i = 0; i < weights.length; i++, it.advance()) {
+ DBID id = it.getDBID();
+ if(weights[i] > 0.0) {
+ double d = distF.doubleDistance(lv, rel.get(id));
+ weights[i] = Math.min(weights[i], d * d);
+ weightsum += weights[i];
+ }
+ }
+ return weightsum;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+ @Override
+ protected KMeansPlusPlusInitialMeans<V, D> makeInstance() {
+ return new KMeansPlusPlusInitialMeans<V, D>(seed);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
new file mode 100644
index 00000000..30e59453
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java
@@ -0,0 +1,78 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Initialize K-means by randomly choosing k exsiting elements as cluster
+ * centers.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class RandomlyChosenInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public RandomlyChosenInitialMeans(Long seed) {
+ super(seed);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, seed);
+ List<Vector> means = new ArrayList<Vector>(k);
+ for(DBID id : ids) {
+ means.add(relation.get(id).getColumnVector());
+ }
+ return means;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+
+ @Override
+ protected RandomlyChosenInitialMeans<V> makeInstance() {
+ return new RandomlyChosenInitialMeans<V>(seed);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
new file mode 100644
index 00000000..e8a466dd
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -0,0 +1,87 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * Initialize k-means by generating random vectors (within the data sets value
+ * range).
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class RandomlyGeneratedInitialMeans<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization<V> {
+ /**
+ * Constructor.
+ *
+ * @param seed Random seed.
+ */
+ public RandomlyGeneratedInitialMeans(Long seed) {
+ super(seed);
+ }
+
+ @Override
+ public List<Vector> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) {
+ final int dim = DatabaseUtil.dimensionality(relation);
+ Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
+ List<Vector> means = new ArrayList<Vector>(k);
+ final Random random = (this.seed != null) ? new Random(this.seed) : new Random();
+ for(int i = 0; i < k; i++) {
+ double[] r = MathUtil.randomDoubleArray(dim, random);
+ // Rescale
+ for(int d = 0; d < dim; d++) {
+ r[d] = minmax.first.doubleValue(d + 1) + (minmax.second.doubleValue(d + 1) - minmax.first.doubleValue(d + 1)) * r[d];
+ }
+ means.add(new Vector(r));
+ }
+ return means;
+ }
+
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractKMeansInitialization.Parameterizer<V> {
+
+ @Override
+ protected RandomlyGeneratedInitialMeans<V> makeInstance() {
+ return new RandomlyGeneratedInitialMeans<V>(seed);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
new file mode 100644
index 00000000..2ce625b0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>K-means clustering and variations.</p>
+ */
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2012
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java
index 660a7a4f..eed031df 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java
@@ -1,5 +1,5 @@
/**
- * <p>Clustering algorithms</p>
+ * <p>Clustering algorithms.</p>
*
* Clustering algorithms are supposed to implement the {@link de.lmu.ifi.dbs.elki.algorithm.Algorithm}-Interface.
* The more specialized interface {@link de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm}
@@ -15,7 +15,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
index dfc4e1cd..e3b274a6 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
index 987c7eda..c4c1687b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
index 36473cc0..40ab60a8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
@@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
index 92c2248c..3f16e907 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java
index 22e9c150..4ca5a564 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
index c0edb2ea..963c0922 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -44,7 +44,7 @@ import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.AbstractDimensionsSelectingDoubleDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionsSelectingEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
@@ -94,7 +94,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus
/**
* The distance function to determine the distance between database objects.
* <p>
- * Default value: {@link DimensionsSelectingEuclideanDistanceFunction}
+ * Default value: {@link SubspaceEuclideanDistanceFunction}
* </p>
* <p>
* Key: {@code -subclu.distancefunction}
@@ -477,7 +477,7 @@ public class SUBCLU<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clus
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, DimensionsSelectingEuclideanDistanceFunction.class);
+ ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, SubspaceEuclideanDistanceFunction.class);
if(config.grab(param)) {
distance = param.instantiateClass(config);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java
index 1874c9e8..eff71a35 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java
index 4b6fa9ad..db687567 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
/**
@@ -265,7 +266,7 @@ public class CLIQUEUnit<V extends NumberVector<V, ?>> {
resultIntervals.add(this.intervals.last());
resultIntervals.add(other.intervals.last());
- ModifiableDBIDs resultIDs = DBIDUtil.newHashSet(this.ids);
+ HashSetModifiableDBIDs resultIDs = DBIDUtil.newHashSet(this.ids);
resultIDs.retainAll(other.ids);
if(resultIDs.size() / all >= tau) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java
index 444cb0e6..7a686190 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java
@@ -7,7 +7,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java
index 168ceadb..2a1eb930 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java
@@ -10,7 +10,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java
index 02350db3..43c6a218 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -85,9 +85,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl
public static final OptionID MULTIPLE_ID = OptionID.getOrCreateOptionID("bylabelclustering.multiple", "Flag to indicate that only subspaces with large coverage " + "(i.e. the fraction of the database that is covered by the dense units) " + "are selected, the rest will be pruned.");
/**
- * Flag to indicate that multiple cluster assignment is possible. If an
- * assignment to multiple clusters is desired, the labels indicating the
- * clusters need to be separated by blanks.
+ * Pattern to recognize noise clusters by.
*/
public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("bylabelclustering.noise", "Pattern to recognize noise classes by their label.");
@@ -144,7 +142,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl
ModifiableDBIDs noiseids = DBIDUtil.newArray();
Clustering<Model> result = new Clustering<Model>("By Label Clustering", "bylabel-clustering");
for(Entry<String, ModifiableDBIDs> entry : labelMap.entrySet()) {
- ModifiableDBIDs ids = labelMap.get(entry.getKey());
+ ModifiableDBIDs ids = entry.getValue();
if(ids.size() <= 1) {
noiseids.addDBIDs(ids);
continue;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java
index 5b8041d7..228cc7e7 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java
new file mode 100644
index 00000000..cd45cda2
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java
@@ -0,0 +1,163 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.Model;
+import de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorInterface;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
+
+/**
+ * Pseudo clustering using annotated models.
+ *
+ * This "algorithm" puts elements into the same cluster when they agree in their
+ * model. I.e. it just uses a predefined clustering, and is mostly useful for
+ * testing and evaluation (e.g. comparing the result of a real algorithm to the
+ * reference result / golden standard used by the generator).
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.uses Model
+ */
+@Title("Clustering by model")
+@Description("Cluster points by a (pre-assigned!) model. For comparing results with a reference clustering.")
+public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(ByModelClustering.class);
+
+ /**
+ * Pattern to recognize noise clusters with
+ */
+ public static final OptionID NOISE_ID = OptionID.getOrCreateOptionID("bymodel.noise", "Pattern to recognize noise models by their label.");
+
+ /**
+ * Holds the value of {@link #NOISE_ID}.
+ */
+ private Pattern noisepattern = null;
+
+ /**
+ * Constructor.
+ *
+ * @param noisepattern Noise pattern
+ */
+ public ByModelClustering(Pattern noisepattern) {
+ super();
+ this.noisepattern = noisepattern;
+ }
+
+ /**
+ * Constructor without parameters
+ */
+ public ByModelClustering() {
+ this(null);
+ }
+
+ /**
+ * Run the actual clustering algorithm.
+ *
+ * @param relation The data input we use
+ */
+ public Clustering<Model> run(Relation<Model> relation) {
+ // Build model mapping
+ HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<Model, ModifiableDBIDs>();
+ for(DBID id : relation.iterDBIDs()) {
+ Model model = relation.get(id);
+ ModifiableDBIDs modelids = modelMap.get(model);
+ if(modelids == null) {
+ modelids = DBIDUtil.newHashSet();
+ modelMap.put(model, modelids);
+ }
+ modelids.add(id);
+ }
+
+ Clustering<Model> result = new Clustering<Model>("By Model Clustering", "bymodel-clustering");
+ for(Entry<Model, ModifiableDBIDs> entry : modelMap.entrySet()) {
+ final Model model = entry.getKey();
+ final ModifiableDBIDs ids = entry.getValue();
+ final String name = (model instanceof GeneratorInterface) ? ((GeneratorInterface) model).getName() : model.toString();
+ Cluster<Model> c = new Cluster<Model>(name, ids, model);
+ if(noisepattern != null && noisepattern.matcher(name).find()) {
+ c.setNoise(true);
+ }
+ result.addCluster(c);
+ }
+ return result;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.MODEL);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ protected Pattern noisepat;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ PatternParameter noisepatP = new PatternParameter(NOISE_ID, true);
+ if(config.grab(noisepatP)) {
+ noisepat = noisepatP.getValue();
+ }
+ }
+
+ @Override
+ protected ByModelClustering makeInstance() {
+ return new ByModelClustering(noisepat);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java
index a316ce57..2e7d006d 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java
index b85f5445..c497632c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2011
+ Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java
index 5629855c..5870a736 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java
@@ -7,7 +7,7 @@
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
-Copyright (C) 2011
+Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team