Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java')
-rw-r--r--  src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java | 546
1 file changed, 306 insertions, 240 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
index 03e9978f..805281b0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -28,7 +28,6 @@ import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import java.util.ArrayList;
-import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
@@ -44,25 +43,32 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultUtil;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.Mean;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
-import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
@@ -71,13 +77,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.CTriple;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
/**
* <p/>
- * Provides the PROCLUS algorithm, an algorithm to find subspace clusters in
- * high dimensional spaces.
+ * The PROCLUS algorithm, an algorithm to find subspace clusters in high
+ * dimensional spaces.
* </p>
* <p/>
* Reference: <br>
@@ -92,30 +97,20 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
-// TODO: optimize by creating much less objects
@Title("PROCLUS: PROjected CLUStering")
@Description("Algorithm to find subspace clusters in high dimensional spaces.")
-@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188")
-public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", //
+title = "Fast Algorithms for Projected Clustering", //
+booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", //
+url = "http://dx.doi.org/10.1145/304181.304188")
+public class PROCLUS<V extends NumberVector> extends AbstractProjectedClustering<Clustering<SubspaceModel>, V> implements SubspaceClusteringAlgorithm<SubspaceModel> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(PROCLUS.class);
/**
- * Parameter to specify the multiplier for the initial number of medoids, must
- * be an integer greater than 0.
- * <p>
- * Default value: {@code 10}
- * </p>
- * <p>
- * Key: {@code -proclus.mi}
- * </p>
- */
- public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids.");
-
- /**
- * Holds the value of {@link #M_I_ID}.
+ * Multiplier for the initial number of medoids.
*/
private int m_i;
@@ -145,27 +140,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param database Database to process
* @param relation Relation to process
*/
- public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
- DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database);
- RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
+ public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
+ DistanceQuery<V> distFunc = this.getDistanceQuery(database);
+ RangeQuery<V> rangeQuery = database.getRangeQuery(distFunc);
final Random random = rnd.getSingleThreadedRandom();
- if (RelationUtil.dimensionality(relation) < l) {
+ if(RelationUtil.dimensionality(relation) < l) {
throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")");
}
// TODO: use a StepProgress!
// initialization phase
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("1. Initialization phase...");
}
int sampleSize = Math.min(relation.size(), k_i * k);
DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random.nextLong());
int medoidSize = Math.min(relation.size(), m_i * k);
- DBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random);
+ ArrayDBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("sampleSize ").append(sampleSize).append('\n');
@@ -176,15 +171,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
}
// iterative phase
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("2. Iterative phase...");
}
double bestObjective = Double.POSITIVE_INFINITY;
- ModifiableDBIDs m_best = null;
- ModifiableDBIDs m_bad = null;
- ModifiableDBIDs m_current = initialSet(medoids, k, random);
+ ArrayDBIDs m_best = null;
+ DBIDs m_bad = null;
+ ArrayDBIDs m_current = initialSet(medoids, k, random);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("m_c ").append(m_current).append('\n');
@@ -193,47 +188,44 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null;
- // TODO: Use DataStore and Trove for performance
- Map<DBID, PROCLUSCluster> clusters = null;
+ ArrayList<PROCLUSCluster> clusters = null;
int loops = 0;
- while (loops < 10) {
- Map<DBID, TIntSet> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery);
- clusters = assignPoints(dimensions, relation);
+ while(loops < 10) {
+ TIntSet[] dimensions = findDimensions(m_current, relation, distFunc, rangeQuery);
+ clusters = assignPoints(m_current, dimensions, relation);
double objectiveFunction = evaluateClusters(clusters, dimensions, relation);
- if (objectiveFunction < bestObjective) {
+ if(objectiveFunction < bestObjective) {
// restart counting loops
loops = 0;
bestObjective = objectiveFunction;
m_best = m_current;
- m_bad = computeBadMedoids(clusters, (int) (relation.size() * 0.1 / k));
+ m_bad = computeBadMedoids(m_current, clusters, (int) (relation.size() * 0.1 / k));
}
m_current = computeM_current(medoids, m_best, m_bad, random);
loops++;
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size(), LOG);
}
}
- if (cprogress != null) {
- cprogress.setCompleted(LOG);
- }
+ LOG.setCompleted(cprogress);
// refinement phase
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("3. Refinement phase...");
}
- List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<>(clusters.values()), relation);
+ List<Pair<Vector, TIntSet>> dimensions = findDimensions(clusters, relation);
List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation);
// build result
int numClusters = 1;
- Clustering<SubspaceModel<V>> result = new Clustering<>("ProClus clustering", "proclus-clustering");
- for (PROCLUSCluster c : finalClusters) {
- Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.objectIDs);
- cluster.setModel(new SubspaceModel<>(new Subspace(c.getDimensions()), c.centroid));
+ Clustering<SubspaceModel> result = new Clustering<>("ProClus clustering", "proclus-clustering");
+ for(PROCLUSCluster c : finalClusters) {
+ Cluster<SubspaceModel> cluster = new Cluster<>(c.objectIDs);
+ cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid));
cluster.setName("cluster_" + numClusters++);
result.addToplevelCluster(cluster);
@@ -250,48 +242,64 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param random random number generator
* @return a piercing set of m medoids from the specified sample set
*/
- private ModifiableDBIDs greedy(DistanceQuery<V, DoubleDistance> distFunc, DBIDs sampleSet, int m, Random random) {
+ private ArrayDBIDs greedy(DistanceQuery<V> distFunc, DBIDs sampleSet, int m, Random random) {
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(m);
+
ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet);
- ModifiableDBIDs medoids = DBIDUtil.newHashSet();
+ DBIDArrayIter iter = s.iter();
+ DBIDVar m_i = DBIDUtil.newVar();
+ int size = s.size();
// m_1 is random point of S
- DBID m_i = s.remove(random.nextInt(s.size()));
+ final int r = random.nextInt(size);
+ s.assignVar(r, m_i);
medoids.add(m_i);
- if (LOG.isDebugging()) {
- LOG.debugFiner("medoids " + medoids);
+ if(LOG.isDebugging()) {
+ LOG.debugFiner("medoids " + medoids.toString());
}
+ // Remove m_i from candidates, by moving to the end.
+ s.swap(r, size - 1);
+ --size;
+
+ // To track the current worst element:
+ int worst = -1;
+ double worstd = Double.NEGATIVE_INFINITY;
// compute distances between each point in S and m_i
- // FIXME: don't use maps, so we can work with DBIDRef
- Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<>();
- for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) {
- DBID id = DBIDUtil.deref(iter);
- DoubleDistance dist = distFunc.distance(id, m_i);
- distances.put(id, DBIDUtil.newDistancePair(dist, id));
+ WritableDoubleDataStore distances = DataStoreUtil.makeDoubleStorage(s, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ for(iter.seek(0); iter.getOffset() < size; iter.advance()) {
+ final double dist = distFunc.distance(iter, m_i);
+ distances.putDouble(iter, dist);
+ if(dist > worstd) {
+ worstd = dist;
+ worst = iter.getOffset();
+ }
}
- for (int i = 1; i < m; i++) {
+ for(int i = 1; i < m; i++) {
// choose medoid m_i to be far from previous medoids
- List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<>(distances.values());
- DistanceDBIDResultUtil.sortByDistance(d);
-
- m_i = DBIDUtil.deref(d.get(d.size() - 1));
+ s.assignVar(worst, m_i);
medoids.add(m_i);
- s.remove(m_i);
- distances.remove(m_i);
-
- // compute distances of each point to closest medoid
- for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) {
- DBID id = DBIDUtil.deref(iter);
- DoubleDistance dist_new = distFunc.distance(id, m_i);
- DoubleDistance dist_old = distances.get(id).getDistance();
-
- DoubleDistance dist = dist_new.compareTo(dist_old) < 0 ? dist_new : dist_old;
- distances.put(id, DBIDUtil.newDistancePair(dist, id));
+ // Remove m_i from candidates, by moving to the end.
+ s.swap(worst, size - 1);
+ --size;
+
+ // compute distances of each point to closest medoid; track worst.
+ worst = -1;
+ worstd = Double.NEGATIVE_INFINITY;
+ for(iter.seek(0); iter.getOffset() < size; iter.advance()) {
+ double dist_new = distFunc.distance(iter, m_i);
+ double dist_old = distances.doubleValue(iter);
+ double dist = (dist_new < dist_old) ? dist_new : dist_old;
+ distances.putDouble(iter, dist);
+ if(dist > worstd) {
+ worstd = dist;
+ worst = iter.getOffset();
+ }
}
- if (LOG.isDebugging()) {
- LOG.debugFiner("medoids " + medoids);
+ if(LOG.isDebugging()) {
+ LOG.debugFiner("medoids " + medoids.toString());
}
}
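Note on the hunk above: the rewritten greedy() keeps the usual PROCLUS farthest-point heuristic, but replaces the HashMap of DoubleDistance pairs (which was re-sorted on every iteration) with a flat WritableDoubleDataStore plus an explicitly tracked farthest candidate, so each iteration is a single linear scan. A minimal sketch of the same heuristic over a precomputed distance matrix; greedyMedoids and dist are hypothetical plain-array stand-ins, not ELKI API:

// Farthest-point (greedy) medoid selection over a precomputed, symmetric
// distance matrix. Assumes m <= dist.length; illustration only.
static int[] greedyMedoids(double[][] dist, int m, java.util.Random rnd) {
  final int n = dist.length;
  int[] medoids = new int[m];
  medoids[0] = rnd.nextInt(n);                     // m_1 is a random point of S
  double[] minDist = new double[n];                // distance to the closest chosen medoid
  for(int j = 0; j < n; j++) {
    minDist[j] = dist[j][medoids[0]];
  }
  minDist[medoids[0]] = Double.NEGATIVE_INFINITY;  // never re-pick a chosen point
  for(int i = 1; i < m; i++) {
    int worst = -1;
    double worstd = Double.NEGATIVE_INFINITY;
    for(int j = 0; j < n; j++) {                   // candidate farthest from all chosen medoids
      if(minDist[j] > worstd) {
        worstd = minDist[j];
        worst = j;
      }
    }
    medoids[i] = worst;
    minDist[worst] = Double.NEGATIVE_INFINITY;
    for(int j = 0; j < n; j++) {                   // one linear pass to refresh closest distances
      minDist[j] = Math.min(minDist[j], dist[j][medoids[i]]);
    }
  }
  return medoids;
}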
@@ -306,14 +314,8 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param random random number generator
* @return a set of k elements from the specified sample set
*/
- private ModifiableDBIDs initialSet(DBIDs sampleSet, int k, Random random) {
- ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet);
- ModifiableDBIDs initialSet = DBIDUtil.newHashSet();
- while (initialSet.size() < k) {
- DBID next = s.remove(random.nextInt(s.size()));
- initialSet.add(next);
- }
- return initialSet;
+ private ArrayDBIDs initialSet(DBIDs sampleSet, int k, Random random) {
+ return DBIDUtil.ensureArray(DBIDUtil.randomSample(sampleSet, k, random));
}
/**
@@ -325,21 +327,21 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param random random number generator
* @return m_current, the set of medoids in current iteration
*/
- private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) {
+ private ArrayDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) {
ArrayModifiableDBIDs m_list = DBIDUtil.newArray(m);
m_list.removeDBIDs(m_best);
- ModifiableDBIDs m_current = DBIDUtil.newHashSet();
- for (DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) {
- DBID m_i = DBIDUtil.deref(iter);
- if (m_bad.contains(m_i)) {
+ ArrayModifiableDBIDs m_current = DBIDUtil.newArray();
+ for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) {
+ if(m_bad.contains(iter)) {
int currentSize = m_current.size();
- while (m_current.size() == currentSize) {
+ while(m_current.size() == currentSize) {
DBID next = m_list.remove(random.nextInt(m_list.size()));
m_current.add(next);
}
- } else {
- m_current.add(m_i);
+ }
+ else {
+ m_current.add(iter);
}
}
@@ -357,29 +359,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param distFunc the distance function
* @return a mapping of the medoid's id to its locality
*/
- private Map<DBID, DistanceDBIDList<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) {
- Map<DBID, DistanceDBIDList<DoubleDistance>> result = new HashMap<>();
+ private DataStore<DoubleDBIDList> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) {
+ WritableDataStore<DoubleDBIDList> result = DataStoreUtil.makeStorage(medoids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, DoubleDBIDList.class);
- for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
- DBID m = DBIDUtil.deref(iter);
+ for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
// determine minimum distance between current medoid m and any other
// medoid m_i
- DoubleDistance minDist = null;
- for (DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) {
- DBID m_i = DBIDUtil.deref(iter2);
- if (DBIDUtil.equal(m_i, m)) {
+ double minDist = Double.POSITIVE_INFINITY;
+ for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) {
+ if(DBIDUtil.equal(iter, iter2)) {
continue;
}
- DoubleDistance currentDist = distFunc.distance(m, m_i);
- if (minDist == null || currentDist.compareTo(minDist) < 0) {
+ double currentDist = distFunc.distance(iter, iter2);
+ if(currentDist < minDist) {
minDist = currentDist;
}
}
// determine points in sphere centered at m with radius minDist
- assert minDist != null;
- DistanceDBIDList<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist);
- result.put(m, qr);
+ assert minDist != Double.POSITIVE_INFINITY;
+ DoubleDBIDList qr = rangeQuery.getRangeForDBID(iter, minDist);
+ result.put(iter, qr);
}
return result;
@@ -395,68 +395,67 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the set of correlated dimensions for each medoid in the specified
* medoid set
*/
- private Map<DBID, TIntSet> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) {
+ private TIntSet[] findDimensions(ArrayDBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) {
// get localities
- Map<DBID, DistanceDBIDList<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery);
+ DataStore<DoubleDBIDList> localities = getLocalities(medoids, database, distFunc, rangeQuery);
// compute x_ij = avg distance from points in l_i to medoid m_i
int dim = RelationUtil.dimensionality(database);
- Map<DBID, double[]> averageDistances = new HashMap<>();
+ double[][] averageDistances = new double[medoids.size()][];
- for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
- DBID m_i = DBIDUtil.deref(iter);
- V medoid_i = database.get(m_i);
- DistanceDBIDList<DoubleDistance> l_i = localities.get(m_i);
+ int i = 0;
+ for(DBIDArrayIter iter = medoids.iter(); iter.valid(); iter.advance(), i++) {
+ V medoid_i = database.get(iter);
+ DoubleDBIDList l_i = localities.get(iter);
double[] x_i = new double[dim];
- for (DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) {
+ for(DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) {
V o = database.get(qr);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] += Math.abs(medoid_i.doubleValue(d) - o.doubleValue(d));
}
}
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] /= l_i.size();
}
- averageDistances.put(m_i, x_i);
+ averageDistances[i] = x_i;
}
- Map<DBID, TIntSet> dimensionMap = new HashMap<>();
- List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<>();
- for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) {
- DBID m_i = DBIDUtil.deref(iter);
+ TIntSet[] dimensionMap = new TIntSet[medoids.size()];
+ List<DoubleIntInt> z_ijs = new ArrayList<>();
+ for(i = 0; i < medoids.size(); i++) {
TIntSet dims_i = new TIntHashSet();
- dimensionMap.put(m_i, dims_i);
+ dimensionMap[i] = dims_i;
- double[] x_i = averageDistances.get(m_i);
+ double[] x_i = averageDistances[i];
// y_i
double y_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
y_i += x_i[j];
}
y_i /= dim;
// sigma_i
double sigma_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
double diff = x_i[j] - y_i;
sigma_i += diff * diff;
}
sigma_i /= (dim - 1);
sigma_i = Math.sqrt(sigma_i);
- for (int j = 0; j < dim; j++) {
- z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, m_i, j));
+ for(int j = 0; j < dim; j++) {
+ z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j));
}
}
Collections.sort(z_ijs);
int max = Math.max(k * l, 2);
- for (int m = 0; m < max; m++) {
- CTriple<Double, DBID, Integer> z_ij = z_ijs.get(m);
- TIntSet dims_i = dimensionMap.get(z_ij.getSecond());
- dims_i.add(z_ij.getThird());
+ for(int m = 0; m < max; m++) {
+ DoubleIntInt z_ij = z_ijs.get(m);
+ TIntSet dims_i = dimensionMap[z_ij.dimi];
+ dims_i.add(z_ij.dimj);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("z_ij ").append(z_ij).append('\n');
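The dimension scoring in the hunk above works as follows: for each medoid i, the per-dimension average distances x_i[j] within its locality are standardized to z_ij = (x_i[j] - y_i) / sigma_i using their mean y_i and sample standard deviation sigma_i, all z_ij are sorted ascending, and the globally smallest max(k*l, 2) pairs are kept. A hedged, standalone sketch of that scoring step using array indices instead of DBIDs; smallestZScores is a hypothetical helper, not the ELKI signature:

// Standardize per-dimension average distances and keep the 'total' smallest
// (medoid, dimension) pairs; x[i][j] is the average distance of locality i
// along dimension j.
static java.util.List<int[]> smallestZScores(double[][] x, int total) {
  final int dim = x[0].length;
  java.util.ArrayList<double[]> z = new java.util.ArrayList<>();  // entries: { z_ij, i, j }
  for(int i = 0; i < x.length; i++) {
    double y = 0;                                  // y_i: mean of x_i
    for(int j = 0; j < dim; j++) {
      y += x[i][j];
    }
    y /= dim;
    double sigma = 0;                              // sigma_i: sample standard deviation of x_i
    for(int j = 0; j < dim; j++) {
      final double diff = x[i][j] - y;
      sigma += diff * diff;
    }
    sigma = Math.sqrt(sigma / (dim - 1));
    for(int j = 0; j < dim; j++) {
      z.add(new double[] { (x[i][j] - y) / sigma, i, j });
    }
  }
  z.sort((a, b) -> Double.compare(a[0], b[0]));    // ascending: small z-score = tightly correlated dimension
  java.util.List<int[]> picked = new java.util.ArrayList<>();
  for(int m = 0; m < total && m < z.size(); m++) {
    picked.add(new int[] { (int) z.get(m)[1], (int) z.get(m)[2] });
  }
  return picked;
}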
@@ -476,64 +475,65 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the set of correlated dimensions for each specified cluster
* centroid
*/
- private List<Pair<V, TIntSet>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) {
+ private List<Pair<Vector, TIntSet>> findDimensions(ArrayList<PROCLUSCluster> clusters, Relation<V> database) {
// compute x_ij = avg distance from points in c_i to c_i.centroid
int dim = RelationUtil.dimensionality(database);
- Map<Integer, double[]> averageDistances = new HashMap<>();
+ final int numc = clusters.size();
+ double[][] averageDistances = new double[numc][];
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < numc; i++) {
PROCLUSCluster c_i = clusters.get(i);
double[] x_i = new double[dim];
- for (DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) {
V o = database.get(iter);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] += Math.abs(c_i.centroid.doubleValue(d) - o.doubleValue(d));
}
}
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
x_i[d] /= c_i.objectIDs.size();
}
- averageDistances.put(i, x_i);
+ averageDistances[i] = x_i;
}
- List<CTriple<Double, Integer, Integer>> z_ijs = new ArrayList<>();
- for (int i = 0; i < clusters.size(); i++) {
- double[] x_i = averageDistances.get(i);
+ List<DoubleIntInt> z_ijs = new ArrayList<>();
+ for(int i = 0; i < numc; i++) {
+ double[] x_i = averageDistances[i];
// y_i
double y_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
y_i += x_i[j];
}
y_i /= dim;
// sigma_i
double sigma_i = 0;
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
double diff = x_i[j] - y_i;
sigma_i += diff * diff;
}
sigma_i /= (dim - 1);
sigma_i = Math.sqrt(sigma_i);
- for (int j = 0; j < dim; j++) {
- z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, i, j));
+ for(int j = 0; j < dim; j++) {
+ z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j));
}
}
Collections.sort(z_ijs);
// mapping cluster index -> dimensions
- Map<Integer, TIntSet> dimensionMap = new HashMap<>();
+ TIntSet[] dimensionMap = new TIntSet[numc];
int max = Math.max(k * l, 2);
- for (int m = 0; m < max; m++) {
- CTriple<Double, Integer, Integer> z_ij = z_ijs.get(m);
- TIntSet dims_i = dimensionMap.get(z_ij.getSecond());
- if (dims_i == null) {
+ for(int m = 0; m < max; m++) {
+ DoubleIntInt z_ij = z_ijs.get(m);
+ TIntSet dims_i = dimensionMap[z_ij.dimi];
+ if(dims_i == null) {
dims_i = new TIntHashSet();
- dimensionMap.put(z_ij.getSecond(), dims_i);
+ dimensionMap[z_ij.dimi] = dims_i;
}
- dims_i.add(z_ij.getThird());
+ dims_i.add(z_ij.dimj);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("z_ij ").append(z_ij).append('\n');
@@ -543,9 +543,12 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
}
// mapping cluster -> dimensions
- List<Pair<V, TIntSet>> result = new ArrayList<>();
- for (int i : dimensionMap.keySet()) {
- TIntSet dims_i = dimensionMap.get(i);
+ List<Pair<Vector, TIntSet>> result = new ArrayList<>();
+ for(int i = 0; i < numc; i++) {
+ TIntSet dims_i = dimensionMap[i];
+ if(dims_i == null) {
+ continue;
+ }
PROCLUSCluster c_i = clusters.get(i);
result.add(new Pair<>(c_i.centroid, dims_i));
}
@@ -555,45 +558,51 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
/**
* Assigns the objects to the clusters.
*
+ * @param m_current Current centers
* @param dimensions set of correlated dimensions for each medoid of the
* cluster
* @param database the database containing the objects
* @return the assignments of the object to the clusters
*/
- private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, TIntSet> dimensions, Relation<V> database) {
- Map<DBID, ModifiableDBIDs> clusterIDs = new HashMap<>();
- for (DBID m_i : dimensions.keySet()) {
- clusterIDs.put(m_i, DBIDUtil.newHashSet());
- }
-
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
- DBID p_id = DBIDUtil.deref(it);
- V p = database.get(p_id);
- DistanceDBIDPair<DoubleDistance> minDist = null;
- for (DBID m_i : dimensions.keySet()) {
+ private ArrayList<PROCLUSCluster> assignPoints(ArrayDBIDs m_current, TIntSet[] dimensions, Relation<V> database) {
+ ModifiableDBIDs[] clusterIDs = new ModifiableDBIDs[dimensions.length];
+ for(int i = 0; i < m_current.size(); i++) {
+ clusterIDs[i] = DBIDUtil.newHashSet();
+ }
+
+ DBIDArrayIter m_i = m_current.iter();
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ V p = database.get(it);
+ double minDist = Double.NaN;
+ int best = -1, i = 0;
+ for(m_i.seek(0); m_i.valid(); m_i.advance(), i++) {
V m = database.get(m_i);
- DistanceDBIDPair<DoubleDistance> currentDist = DBIDUtil.newDistancePair(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i);
- if (minDist == null || currentDist.compareByDistance(minDist) < 0) {
+ double currentDist = manhattanSegmentalDistance(p, m, dimensions[i]);
+ if(!(minDist <= currentDist)) {
minDist = currentDist;
+ best = i;
}
}
// add p to cluster with mindist
- assert minDist != null;
- ModifiableDBIDs ids = clusterIDs.get(DBIDUtil.deref(minDist));
- ids.add(p_id);
+ assert best >= 0;
+ ModifiableDBIDs ids = clusterIDs[best];
+ ids.add(it);
}
- Map<DBID, PROCLUSCluster> clusters = new HashMap<>();
- for (DBID m_i : dimensions.keySet()) {
- ModifiableDBIDs objectIDs = clusterIDs.get(m_i);
- if (!objectIDs.isEmpty()) {
- TIntSet clusterDimensions = dimensions.get(m_i);
- V centroid = Centroid.make(database, objectIDs).toVector(database);
- clusters.put(m_i, new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
+ ArrayList<PROCLUSCluster> clusters = new ArrayList<>(m_current.size());
+ for(int i = 0; i < dimensions.length; i++) {
+ ModifiableDBIDs objectIDs = clusterIDs[i];
+ if(!objectIDs.isEmpty()) {
+ TIntSet clusterDimensions = dimensions[i];
+ Vector centroid = Centroid.make(database, objectIDs);
+ clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
+ }
+ else {
+ clusters.add(null);
}
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("clusters ").append(clusters).append('\n');
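One idiom in the assignPoints() hunk above is easy to misread: minDist starts as Double.NaN and the update test is written as if(!(minDist <= currentDist)). Since every comparison with NaN is false, the negated test is true for the first candidate, so no separate "first iteration" flag is needed; afterwards it behaves like currentDist < minDist. A minimal illustration with made-up values:

// NaN as "not assigned yet": the first candidate always passes the test.
double minDist = Double.NaN;
int best = -1;
double[] candidates = { 3.0, 1.5, 2.0 };
for(int i = 0; i < candidates.length; i++) {
  if(!(minDist <= candidates[i])) {  // true for NaN, and for strictly smaller values
    minDist = candidates[i];
    best = i;
  }
}
// result: best == 1, minDist == 1.5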
@@ -610,42 +619,43 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param database the database containing the objects
* @return the assignments of the object to the clusters
*/
- private List<PROCLUSCluster> finalAssignment(List<Pair<V, TIntSet>> dimensions, Relation<V> database) {
+ private List<PROCLUSCluster> finalAssignment(List<Pair<Vector, TIntSet>> dimensions, Relation<V> database) {
Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>();
- for (int i = 0; i < dimensions.size(); i++) {
+ for(int i = 0; i < dimensions.size(); i++) {
clusterIDs.put(i, DBIDUtil.newHashSet());
}
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
- DBID p_id = DBIDUtil.deref(it);
- V p = database.get(p_id);
- Pair<DoubleDistance, Integer> minDist = null;
- for (int i = 0; i < dimensions.size(); i++) {
- Pair<V, TIntSet> pair_i = dimensions.get(i);
- V c_i = pair_i.first;
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ V p = database.get(it);
+ double minDist = Double.POSITIVE_INFINITY;
+ int best = -1;
+ for(int i = 0; i < dimensions.size(); i++) {
+ Pair<Vector, TIntSet> pair_i = dimensions.get(i);
+ Vector c_i = pair_i.first;
TIntSet dimensions_i = pair_i.second;
- DoubleDistance currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i);
- if (minDist == null || currentDist.compareTo(minDist.first) < 0) {
- minDist = new Pair<>(currentDist, i);
+ double currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i);
+ if(best < 0 || currentDist < minDist) {
+ minDist = currentDist;
+ best = i;
}
}
// add p to cluster with mindist
- assert minDist != null;
- ModifiableDBIDs ids = clusterIDs.get(minDist.second);
- ids.add(p_id);
+ assert minDist >= 0.;
+ ModifiableDBIDs ids = clusterIDs.get(best);
+ ids.add(it);
}
List<PROCLUSCluster> clusters = new ArrayList<>();
- for (int i = 0; i < dimensions.size(); i++) {
+ for(int i = 0; i < dimensions.size(); i++) {
ModifiableDBIDs objectIDs = clusterIDs.get(i);
- if (!objectIDs.isEmpty()) {
+ if(!objectIDs.isEmpty()) {
TIntSet clusterDimensions = dimensions.get(i).second;
- V centroid = Centroid.make(database, objectIDs).toVector(database);
+ Vector centroid = Centroid.make(database, objectIDs);
clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
}
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append('\n');
msg.append("clusters ").append(clusters).append('\n');
@@ -664,14 +674,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the Manhattan segmental distance between o1 and o2 relative to the
* specified dimensions
*/
- private DoubleDistance manhattanSegmentalDistance(V o1, V o2, TIntSet dimensions) {
+ private double manhattanSegmentalDistance(NumberVector o1, NumberVector o2, TIntSet dimensions) {
double result = 0;
- for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
final int d = iter.next();
result += Math.abs(o1.doubleValue(d) - o2.doubleValue(d));
}
result /= dimensions.size();
- return new DoubleDistance(result);
+ return result;
}
/**
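The hunk above only replaces the DoubleDistance wrapper with a primitive double; the measure itself is unchanged: the mean absolute coordinate difference over the cluster's correlated dimensions. For example, with o1 = (1, 5, 2), o2 = (4, 5, 8) and dimensions {0, 2}, the value is (|1 - 4| + |2 - 8|) / 2 = 4.5. A tiny sketch over plain arrays; manhattanSegmental is a hypothetical helper, not the ELKI signature:

// Manhattan segmental distance: average |o1[d] - o2[d]| over the given dimensions.
static double manhattanSegmental(double[] o1, double[] o2, int[] dimensions) {
  double sum = 0;
  for(int d : dimensions) {
    sum += Math.abs(o1[d] - o2[d]);
  }
  return sum / dimensions.length;
}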
@@ -682,20 +692,20 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @param database the database holding the objects
* @return a measure for the cluster quality
*/
- private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, TIntSet> dimensions, Relation<V> database) {
+ private double evaluateClusters(ArrayList<PROCLUSCluster> clusters, TIntSet[] dimensions, Relation<V> database) {
double result = 0;
- for (DBID m_i : clusters.keySet()) {
- PROCLUSCluster c_i = clusters.get(m_i);
- V centroid_i = c_i.centroid;
+ for(int i = 0; i < dimensions.length; i++) {
+ PROCLUSCluster c_i = clusters.get(i);
+ Vector centroid_i = c_i.centroid;
- TIntSet dims_i = dimensions.get(m_i);
+ TIntSet dims_i = dimensions[i];
double w_i = 0;
- for (TIntIterator iter = dims_i.iterator(); iter.hasNext();) {
+ for(TIntIterator iter = dims_i.iterator(); iter.hasNext();) {
final int j = iter.next();
w_i += avgDistance(centroid_i, c_i.objectIDs, database, j);
}
- w_i /= dimensions.keySet().size();
+ w_i /= dimensions.length;
result += c_i.objectIDs.size() * w_i;
}
@@ -713,9 +723,9 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* @return the average distance of the objects to the centroid along the
* specified dimension
*/
- private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) {
+ private double avgDistance(Vector centroid, DBIDs objectIDs, Relation<V> database, int dimension) {
Mean avg = new Mean();
- for (DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) {
V o = database.get(iter);
avg.put(Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension)));
}
@@ -726,16 +736,18 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
* Computes the bad medoids, where the medoid of a cluster with less than the
* specified threshold of objects is bad.
*
+ * @param m_current Current medoids
* @param clusters the clusters
* @param threshold the threshold
* @return the bad medoids
*/
- private ModifiableDBIDs computeBadMedoids(Map<DBID, PROCLUSCluster> clusters, int threshold) {
- ModifiableDBIDs badMedoids = DBIDUtil.newHashSet();
- for (DBID m_i : clusters.keySet()) {
- PROCLUSCluster c_i = clusters.get(m_i);
- if (c_i.objectIDs.size() < threshold) {
- badMedoids.add(m_i);
+ private DBIDs computeBadMedoids(ArrayDBIDs m_current, ArrayList<PROCLUSCluster> clusters, int threshold) {
+ ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(m_current.size());
+ int i = 0;
+ for(DBIDIter it = m_current.iter(); it.valid(); it.advance(), i++) {
+ PROCLUSCluster c_i = clusters.get(i);
+ if(c_i == null || c_i.objectIDs.size() < threshold) {
+ badMedoids.add(it);
}
}
return badMedoids;
@@ -752,6 +764,36 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
}
/**
+ * Simple triple.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ private static class DoubleIntInt implements Comparable<DoubleIntInt> {
+ protected double first;
+
+ protected int dimi, dimj;
+
+ public DoubleIntInt(double first, int second, int third) {
+ this.first = first;
+ this.dimi = second;
+ this.dimj = third;
+ }
+
+ @Override
+ public int compareTo(DoubleIntInt o) {
+ if(this.first < o.first) {
+ return -1;
+ }
+ if(this.first > o.first) {
+ return +1;
+ }
+ return 0;
+ }
+ }
+
+ /**
* Encapsulates the attributes of a cluster.
*
* @apiviz.exclude
@@ -770,16 +812,16 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
/**
* The centroids of this cluster along each dimension.
*/
- V centroid;
+ Vector centroid;
/**
- * Provides a new cluster with the specified parameters.
+ * Constructor.
*
* @param objectIDs the ids of the objects belonging to this cluster
* @param dimensions the correlated dimensions of this cluster
* @param centroid the centroid of this cluster
*/
- public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, V centroid) {
+ public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, Vector centroid) {
this.objectIDs = objectIDs;
this.dimensions = dimensions;
this.centroid = centroid;
@@ -790,10 +832,11 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
StringBuilder result = new StringBuilder();
result.append("Dimensions: [");
boolean notFirst = false;
- for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
- if (notFirst) {
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ if(notFirst) {
result.append(',');
- } else {
+ }
+ else {
notFirst = true;
}
result.append(iter.next());
@@ -809,10 +852,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
*
* @return the correlated dimensions of this cluster as BitSet
*/
- public BitSet getDimensions() {
- BitSet result = new BitSet();
- for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
- result.set(iter.next());
+ public long[] getDimensions() {
+ int maxdim = 0;
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ int d = iter.next();
+ maxdim = (d > maxdim) ? d : maxdim;
+ }
+ long[] result = BitsUtil.zero(maxdim);
+ for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) {
+ BitsUtil.setI(result, iter.next());
}
return result;
}
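The hunk above swaps java.util.BitSet for ELKI's long[]-based BitsUtil representation. A plain-JDK sketch of the same packing, assuming BitsUtil.setI(result, d) sets the d-th bit (which is how it is used here); toBitmask is a hypothetical helper:

// Pack dimension indices into a long[] bitmask, 64 bits per word.
static long[] toBitmask(int[] dimensions) {
  int maxdim = 0;
  for(int d : dimensions) {
    maxdim = Math.max(maxdim, d);
  }
  long[] result = new long[(maxdim >>> 6) + 1];
  for(int d : dimensions) {
    result[d >>> 6] |= 1L << d;  // Java shift counts are taken modulo 64
  }
  return result;
}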
@@ -825,14 +873,32 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedClustering.Parameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractProjectedClustering.Parameterizer {
+ /**
+ * Parameter to specify the multiplier for the initial number of medoids,
+ * must be an integer greater than 0.
+ * <p>
+ * Default value: {@code 10}
+ * </p>
+ * <p>
+ * Key: {@code -proclus.mi}
+ * </p>
+ */
+ public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids.");
+
/**
* Parameter to specify the random generator seed.
*/
public static final OptionID SEED_ID = new OptionID("proclus.seed", "The random number generator seed.");
+ /**
+ * Multiplier for the initial number of medoids.
+ */
protected int m_i = -1;
+ /**
+ * Random generator
+ */
protected RandomFactory rnd;
@Override
@@ -843,14 +909,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
configKI(config);
configL(config);
- IntParameter m_iP = new IntParameter(M_I_ID, 10);
- m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
- if (config.grab(m_iP)) {
+ IntParameter m_iP = new IntParameter(M_I_ID, 10) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(m_iP)) {
m_i = m_iP.getValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}