diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java | 546 |
1 files changed, 306 insertions, 240 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index 03e9978f..805281b0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -28,7 +28,6 @@ import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import java.util.ArrayList; -import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -44,25 +43,32 @@ import de.lmu.ifi.dbs.elki.data.model.SubspaceModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultUtil; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; -import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -71,13 +77,12 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraint import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.CTriple; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * <p/> - * Provides the PROCLUS algorithm, an algorithm to find subspace clusters in - * high dimensional spaces. + * The PROCLUS algorithm, an algorithm to find subspace clusters in high + * dimensional spaces. * </p> * <p/> * Reference: <br> @@ -92,30 +97,20 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; * * @param <V> the type of NumberVector handled by this Algorithm */ -// TODO: optimize by creating much less objects @Title("PROCLUS: PROjected CLUStering") @Description("Algorithm to find subspace clusters in high dimensional spaces.") -@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", title = "Fast Algorithms for Projected Clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304188") -public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedClustering<Clustering<SubspaceModel<V>>, V> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> { +@Reference(authors = "C. C. Aggarwal, C. Procopiuc, J. L. Wolf, P. S. Yu, J. S. Park", // +title = "Fast Algorithms for Projected Clustering", // +booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", // +url = "http://dx.doi.org/10.1145/304181.304188") +public class PROCLUS<V extends NumberVector> extends AbstractProjectedClustering<Clustering<SubspaceModel>, V> implements SubspaceClusteringAlgorithm<SubspaceModel> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(PROCLUS.class); /** - * Parameter to specify the multiplier for the initial number of medoids, must - * be an integer greater than 0. - * <p> - * Default value: {@code 10} - * </p> - * <p> - * Key: {@code -proclus.mi} - * </p> - */ - public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids."); - - /** - * Holds the value of {@link #M_I_ID}. + * Multiplier for the initial number of medoids. */ private int m_i; @@ -145,27 +140,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param database Database to process * @param relation Relation to process */ - public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) { - DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database); - RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); + public Clustering<SubspaceModel> run(Database database, Relation<V> relation) { + DistanceQuery<V> distFunc = this.getDistanceQuery(database); + RangeQuery<V> rangeQuery = database.getRangeQuery(distFunc); final Random random = rnd.getSingleThreadedRandom(); - if (RelationUtil.dimensionality(relation) < l) { + if(RelationUtil.dimensionality(relation) < l) { throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")"); } // TODO: use a StepProgress! // initialization phase - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("1. Initialization phase..."); } int sampleSize = Math.min(relation.size(), k_i * k); DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random.nextLong()); int medoidSize = Math.min(relation.size(), m_i * k); - DBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random); + ArrayDBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("sampleSize ").append(sampleSize).append('\n'); @@ -176,15 +171,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } // iterative phase - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("2. Iterative phase..."); } double bestObjective = Double.POSITIVE_INFINITY; - ModifiableDBIDs m_best = null; - ModifiableDBIDs m_bad = null; - ModifiableDBIDs m_current = initialSet(medoids, k, random); + ArrayDBIDs m_best = null; + DBIDs m_bad = null; + ArrayDBIDs m_current = initialSet(medoids, k, random); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("m_c ").append(m_current).append('\n'); @@ -193,47 +188,44 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null; - // TODO: Use DataStore and Trove for performance - Map<DBID, PROCLUSCluster> clusters = null; + ArrayList<PROCLUSCluster> clusters = null; int loops = 0; - while (loops < 10) { - Map<DBID, TIntSet> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); - clusters = assignPoints(dimensions, relation); + while(loops < 10) { + TIntSet[] dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); + clusters = assignPoints(m_current, dimensions, relation); double objectiveFunction = evaluateClusters(clusters, dimensions, relation); - if (objectiveFunction < bestObjective) { + if(objectiveFunction < bestObjective) { // restart counting loops loops = 0; bestObjective = objectiveFunction; m_best = m_current; - m_bad = computeBadMedoids(clusters, (int) (relation.size() * 0.1 / k)); + m_bad = computeBadMedoids(m_current, clusters, (int) (relation.size() * 0.1 / k)); } m_current = computeM_current(medoids, m_best, m_bad, random); loops++; - if (cprogress != null) { + if(cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } } - if (cprogress != null) { - cprogress.setCompleted(LOG); - } + LOG.setCompleted(cprogress); // refinement phase - if (LOG.isVerbose()) { + if(LOG.isVerbose()) { LOG.verbose("3. Refinement phase..."); } - List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<>(clusters.values()), relation); + List<Pair<Vector, TIntSet>> dimensions = findDimensions(clusters, relation); List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation); // build result int numClusters = 1; - Clustering<SubspaceModel<V>> result = new Clustering<>("ProClus clustering", "proclus-clustering"); - for (PROCLUSCluster c : finalClusters) { - Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.objectIDs); - cluster.setModel(new SubspaceModel<>(new Subspace(c.getDimensions()), c.centroid)); + Clustering<SubspaceModel> result = new Clustering<>("ProClus clustering", "proclus-clustering"); + for(PROCLUSCluster c : finalClusters) { + Cluster<SubspaceModel> cluster = new Cluster<>(c.objectIDs); + cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid)); cluster.setName("cluster_" + numClusters++); result.addToplevelCluster(cluster); @@ -250,48 +242,64 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param random random number generator * @return a piercing set of m medoids from the specified sample set */ - private ModifiableDBIDs greedy(DistanceQuery<V, DoubleDistance> distFunc, DBIDs sampleSet, int m, Random random) { + private ArrayDBIDs greedy(DistanceQuery<V> distFunc, DBIDs sampleSet, int m, Random random) { + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(m); + ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet); - ModifiableDBIDs medoids = DBIDUtil.newHashSet(); + DBIDArrayIter iter = s.iter(); + DBIDVar m_i = DBIDUtil.newVar(); + int size = s.size(); // m_1 is random point of S - DBID m_i = s.remove(random.nextInt(s.size())); + final int r = random.nextInt(size); + s.assignVar(r, m_i); medoids.add(m_i); - if (LOG.isDebugging()) { - LOG.debugFiner("medoids " + medoids); + if(LOG.isDebugging()) { + LOG.debugFiner("medoids " + medoids.toString()); } + // Remove m_i from candidates, by moving to the end. + s.swap(r, size - 1); + --size; + + // To track the current worst element: + int worst = -1; + double worstd = Double.NEGATIVE_INFINITY; // compute distances between each point in S and m_i - // FIXME: don't use maps, so we can work with DBIDRef - Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<>(); - for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { - DBID id = DBIDUtil.deref(iter); - DoubleDistance dist = distFunc.distance(id, m_i); - distances.put(id, DBIDUtil.newDistancePair(dist, id)); + WritableDoubleDataStore distances = DataStoreUtil.makeDoubleStorage(s, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + for(iter.seek(0); iter.getOffset() < size; iter.advance()) { + final double dist = distFunc.distance(iter, m_i); + distances.putDouble(iter, dist); + if(dist > worstd) { + worstd = dist; + worst = iter.getOffset(); + } } - for (int i = 1; i < m; i++) { + for(int i = 1; i < m; i++) { // choose medoid m_i to be far from previous medoids - List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<>(distances.values()); - DistanceDBIDResultUtil.sortByDistance(d); - - m_i = DBIDUtil.deref(d.get(d.size() - 1)); + s.assignVar(worst, m_i); medoids.add(m_i); - s.remove(m_i); - distances.remove(m_i); - - // compute distances of each point to closest medoid - for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { - DBID id = DBIDUtil.deref(iter); - DoubleDistance dist_new = distFunc.distance(id, m_i); - DoubleDistance dist_old = distances.get(id).getDistance(); - - DoubleDistance dist = dist_new.compareTo(dist_old) < 0 ? dist_new : dist_old; - distances.put(id, DBIDUtil.newDistancePair(dist, id)); + // Remove m_i from candidates, by moving to the end. + s.swap(worst, size - 1); + --size; + + // compute distances of each point to closest medoid; track worst. + worst = -1; + worstd = Double.NEGATIVE_INFINITY; + for(iter.seek(0); iter.getOffset() < size; iter.advance()) { + double dist_new = distFunc.distance(iter, m_i); + double dist_old = distances.doubleValue(iter); + double dist = (dist_new < dist_old) ? dist_new : dist_old; + distances.putDouble(iter, dist); + if(dist > worstd) { + worstd = dist; + worst = iter.getOffset(); + } } - if (LOG.isDebugging()) { - LOG.debugFiner("medoids " + medoids); + if(LOG.isDebugging()) { + LOG.debugFiner("medoids " + medoids.toString()); } } @@ -306,14 +314,8 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param random random number generator * @return a set of k elements from the specified sample set */ - private ModifiableDBIDs initialSet(DBIDs sampleSet, int k, Random random) { - ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet); - ModifiableDBIDs initialSet = DBIDUtil.newHashSet(); - while (initialSet.size() < k) { - DBID next = s.remove(random.nextInt(s.size())); - initialSet.add(next); - } - return initialSet; + private ArrayDBIDs initialSet(DBIDs sampleSet, int k, Random random) { + return DBIDUtil.ensureArray(DBIDUtil.randomSample(sampleSet, k, random)); } /** @@ -325,21 +327,21 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param random random number generator * @return m_current, the set of medoids in current iteration */ - private ModifiableDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) { + private ArrayDBIDs computeM_current(DBIDs m, DBIDs m_best, DBIDs m_bad, Random random) { ArrayModifiableDBIDs m_list = DBIDUtil.newArray(m); m_list.removeDBIDs(m_best); - ModifiableDBIDs m_current = DBIDUtil.newHashSet(); - for (DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { - DBID m_i = DBIDUtil.deref(iter); - if (m_bad.contains(m_i)) { + ArrayModifiableDBIDs m_current = DBIDUtil.newArray(); + for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { + if(m_bad.contains(iter)) { int currentSize = m_current.size(); - while (m_current.size() == currentSize) { + while(m_current.size() == currentSize) { DBID next = m_list.remove(random.nextInt(m_list.size())); m_current.add(next); } - } else { - m_current.add(m_i); + } + else { + m_current.add(iter); } } @@ -357,29 +359,27 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param distFunc the distance function * @return a mapping of the medoid's id to its locality */ - private Map<DBID, DistanceDBIDList<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { - Map<DBID, DistanceDBIDList<DoubleDistance>> result = new HashMap<>(); + private DataStore<DoubleDBIDList> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) { + WritableDataStore<DoubleDBIDList> result = DataStoreUtil.makeStorage(medoids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, DoubleDBIDList.class); - for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m = DBIDUtil.deref(iter); + for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { // determine minimum distance between current medoid m and any other // medoid m_i - DoubleDistance minDist = null; - for (DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { - DBID m_i = DBIDUtil.deref(iter2); - if (DBIDUtil.equal(m_i, m)) { + double minDist = Double.POSITIVE_INFINITY; + for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { + if(DBIDUtil.equal(iter, iter2)) { continue; } - DoubleDistance currentDist = distFunc.distance(m, m_i); - if (minDist == null || currentDist.compareTo(minDist) < 0) { + double currentDist = distFunc.distance(iter, iter2); + if(currentDist < minDist) { minDist = currentDist; } } // determine points in sphere centered at m with radius minDist - assert minDist != null; - DistanceDBIDList<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist); - result.put(m, qr); + assert minDist != Double.POSITIVE_INFINITY; + DoubleDBIDList qr = rangeQuery.getRangeForDBID(iter, minDist); + result.put(iter, qr); } return result; @@ -395,68 +395,67 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the set of correlated dimensions for each medoid in the specified * medoid set */ - private Map<DBID, TIntSet> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { + private TIntSet[] findDimensions(ArrayDBIDs medoids, Relation<V> database, DistanceQuery<V> distFunc, RangeQuery<V> rangeQuery) { // get localities - Map<DBID, DistanceDBIDList<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery); + DataStore<DoubleDBIDList> localities = getLocalities(medoids, database, distFunc, rangeQuery); // compute x_ij = avg distance from points in l_i to medoid m_i int dim = RelationUtil.dimensionality(database); - Map<DBID, double[]> averageDistances = new HashMap<>(); + double[][] averageDistances = new double[medoids.size()][]; - for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m_i = DBIDUtil.deref(iter); - V medoid_i = database.get(m_i); - DistanceDBIDList<DoubleDistance> l_i = localities.get(m_i); + int i = 0; + for(DBIDArrayIter iter = medoids.iter(); iter.valid(); iter.advance(), i++) { + V medoid_i = database.get(iter); + DoubleDBIDList l_i = localities.get(iter); double[] x_i = new double[dim]; - for (DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { + for(DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { V o = database.get(qr); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] += Math.abs(medoid_i.doubleValue(d) - o.doubleValue(d)); } } - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] /= l_i.size(); } - averageDistances.put(m_i, x_i); + averageDistances[i] = x_i; } - Map<DBID, TIntSet> dimensionMap = new HashMap<>(); - List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<>(); - for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { - DBID m_i = DBIDUtil.deref(iter); + TIntSet[] dimensionMap = new TIntSet[medoids.size()]; + List<DoubleIntInt> z_ijs = new ArrayList<>(); + for(i = 0; i < medoids.size(); i++) { TIntSet dims_i = new TIntHashSet(); - dimensionMap.put(m_i, dims_i); + dimensionMap[i] = dims_i; - double[] x_i = averageDistances.get(m_i); + double[] x_i = averageDistances[i]; // y_i double y_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { y_i += x_i[j]; } y_i /= dim; // sigma_i double sigma_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { double diff = x_i[j] - y_i; sigma_i += diff * diff; } sigma_i /= (dim - 1); sigma_i = Math.sqrt(sigma_i); - for (int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, m_i, j)); + for(int j = 0; j < dim; j++) { + z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j)); } } Collections.sort(z_ijs); int max = Math.max(k * l, 2); - for (int m = 0; m < max; m++) { - CTriple<Double, DBID, Integer> z_ij = z_ijs.get(m); - TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); - dims_i.add(z_ij.getThird()); + for(int m = 0; m < max; m++) { + DoubleIntInt z_ij = z_ijs.get(m); + TIntSet dims_i = dimensionMap[z_ij.dimi]; + dims_i.add(z_ij.dimj); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("z_ij ").append(z_ij).append('\n'); @@ -476,64 +475,65 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the set of correlated dimensions for each specified cluster * centroid */ - private List<Pair<V, TIntSet>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) { + private List<Pair<Vector, TIntSet>> findDimensions(ArrayList<PROCLUSCluster> clusters, Relation<V> database) { // compute x_ij = avg distance from points in c_i to c_i.centroid int dim = RelationUtil.dimensionality(database); - Map<Integer, double[]> averageDistances = new HashMap<>(); + final int numc = clusters.size(); + double[][] averageDistances = new double[numc][]; - for (int i = 0; i < clusters.size(); i++) { + for(int i = 0; i < numc; i++) { PROCLUSCluster c_i = clusters.get(i); double[] x_i = new double[dim]; - for (DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] += Math.abs(c_i.centroid.doubleValue(d) - o.doubleValue(d)); } } - for (int d = 0; d < dim; d++) { + for(int d = 0; d < dim; d++) { x_i[d] /= c_i.objectIDs.size(); } - averageDistances.put(i, x_i); + averageDistances[i] = x_i; } - List<CTriple<Double, Integer, Integer>> z_ijs = new ArrayList<>(); - for (int i = 0; i < clusters.size(); i++) { - double[] x_i = averageDistances.get(i); + List<DoubleIntInt> z_ijs = new ArrayList<>(); + for(int i = 0; i < numc; i++) { + double[] x_i = averageDistances[i]; // y_i double y_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { y_i += x_i[j]; } y_i /= dim; // sigma_i double sigma_i = 0; - for (int j = 0; j < dim; j++) { + for(int j = 0; j < dim; j++) { double diff = x_i[j] - y_i; sigma_i += diff * diff; } sigma_i /= (dim - 1); sigma_i = Math.sqrt(sigma_i); - for (int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, i, j)); + for(int j = 0; j < dim; j++) { + z_ijs.add(new DoubleIntInt((x_i[j] - y_i) / sigma_i, i, j)); } } Collections.sort(z_ijs); // mapping cluster index -> dimensions - Map<Integer, TIntSet> dimensionMap = new HashMap<>(); + TIntSet[] dimensionMap = new TIntSet[numc]; int max = Math.max(k * l, 2); - for (int m = 0; m < max; m++) { - CTriple<Double, Integer, Integer> z_ij = z_ijs.get(m); - TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); - if (dims_i == null) { + for(int m = 0; m < max; m++) { + DoubleIntInt z_ij = z_ijs.get(m); + TIntSet dims_i = dimensionMap[z_ij.dimi]; + if(dims_i == null) { dims_i = new TIntHashSet(); - dimensionMap.put(z_ij.getSecond(), dims_i); + dimensionMap[z_ij.dimi] = dims_i; } - dims_i.add(z_ij.getThird()); + dims_i.add(z_ij.dimj); - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("z_ij ").append(z_ij).append('\n'); @@ -543,9 +543,12 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } // mapping cluster -> dimensions - List<Pair<V, TIntSet>> result = new ArrayList<>(); - for (int i : dimensionMap.keySet()) { - TIntSet dims_i = dimensionMap.get(i); + List<Pair<Vector, TIntSet>> result = new ArrayList<>(); + for(int i = 0; i < numc; i++) { + TIntSet dims_i = dimensionMap[i]; + if(dims_i == null) { + continue; + } PROCLUSCluster c_i = clusters.get(i); result.add(new Pair<>(c_i.centroid, dims_i)); } @@ -555,45 +558,51 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster /** * Assigns the objects to the clusters. * + * @param m_current Current centers * @param dimensions set of correlated dimensions for each medoid of the * cluster * @param database the database containing the objects * @return the assignments of the object to the clusters */ - private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, TIntSet> dimensions, Relation<V> database) { - Map<DBID, ModifiableDBIDs> clusterIDs = new HashMap<>(); - for (DBID m_i : dimensions.keySet()) { - clusterIDs.put(m_i, DBIDUtil.newHashSet()); - } - - for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { - DBID p_id = DBIDUtil.deref(it); - V p = database.get(p_id); - DistanceDBIDPair<DoubleDistance> minDist = null; - for (DBID m_i : dimensions.keySet()) { + private ArrayList<PROCLUSCluster> assignPoints(ArrayDBIDs m_current, TIntSet[] dimensions, Relation<V> database) { + ModifiableDBIDs[] clusterIDs = new ModifiableDBIDs[dimensions.length]; + for(int i = 0; i < m_current.size(); i++) { + clusterIDs[i] = DBIDUtil.newHashSet(); + } + + DBIDArrayIter m_i = m_current.iter(); + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V p = database.get(it); + double minDist = Double.NaN; + int best = -1, i = 0; + for(m_i.seek(0); m_i.valid(); m_i.advance(), i++) { V m = database.get(m_i); - DistanceDBIDPair<DoubleDistance> currentDist = DBIDUtil.newDistancePair(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i); - if (minDist == null || currentDist.compareByDistance(minDist) < 0) { + double currentDist = manhattanSegmentalDistance(p, m, dimensions[i]); + if(!(minDist <= currentDist)) { minDist = currentDist; + best = i; } } // add p to cluster with mindist - assert minDist != null; - ModifiableDBIDs ids = clusterIDs.get(DBIDUtil.deref(minDist)); - ids.add(p_id); + assert best >= 0; + ModifiableDBIDs ids = clusterIDs[best]; + ids.add(it); } - Map<DBID, PROCLUSCluster> clusters = new HashMap<>(); - for (DBID m_i : dimensions.keySet()) { - ModifiableDBIDs objectIDs = clusterIDs.get(m_i); - if (!objectIDs.isEmpty()) { - TIntSet clusterDimensions = dimensions.get(m_i); - V centroid = Centroid.make(database, objectIDs).toVector(database); - clusters.put(m_i, new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); + ArrayList<PROCLUSCluster> clusters = new ArrayList<>(m_current.size()); + for(int i = 0; i < dimensions.length; i++) { + ModifiableDBIDs objectIDs = clusterIDs[i]; + if(!objectIDs.isEmpty()) { + TIntSet clusterDimensions = dimensions[i]; + Vector centroid = Centroid.make(database, objectIDs); + clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); + } + else { + clusters.add(null); } } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("clusters ").append(clusters).append('\n'); @@ -610,42 +619,43 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param database the database containing the objects * @return the assignments of the object to the clusters */ - private List<PROCLUSCluster> finalAssignment(List<Pair<V, TIntSet>> dimensions, Relation<V> database) { + private List<PROCLUSCluster> finalAssignment(List<Pair<Vector, TIntSet>> dimensions, Relation<V> database) { Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>(); - for (int i = 0; i < dimensions.size(); i++) { + for(int i = 0; i < dimensions.size(); i++) { clusterIDs.put(i, DBIDUtil.newHashSet()); } - for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { - DBID p_id = DBIDUtil.deref(it); - V p = database.get(p_id); - Pair<DoubleDistance, Integer> minDist = null; - for (int i = 0; i < dimensions.size(); i++) { - Pair<V, TIntSet> pair_i = dimensions.get(i); - V c_i = pair_i.first; + for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + V p = database.get(it); + double minDist = Double.POSITIVE_INFINITY; + int best = -1; + for(int i = 0; i < dimensions.size(); i++) { + Pair<Vector, TIntSet> pair_i = dimensions.get(i); + Vector c_i = pair_i.first; TIntSet dimensions_i = pair_i.second; - DoubleDistance currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i); - if (minDist == null || currentDist.compareTo(minDist.first) < 0) { - minDist = new Pair<>(currentDist, i); + double currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i); + if(best < 0 || currentDist < minDist) { + minDist = currentDist; + best = i; } } // add p to cluster with mindist - assert minDist != null; - ModifiableDBIDs ids = clusterIDs.get(minDist.second); - ids.add(p_id); + assert minDist >= 0.; + ModifiableDBIDs ids = clusterIDs.get(best); + ids.add(it); } List<PROCLUSCluster> clusters = new ArrayList<>(); - for (int i = 0; i < dimensions.size(); i++) { + for(int i = 0; i < dimensions.size(); i++) { ModifiableDBIDs objectIDs = clusterIDs.get(i); - if (!objectIDs.isEmpty()) { + if(!objectIDs.isEmpty()) { TIntSet clusterDimensions = dimensions.get(i).second; - V centroid = Centroid.make(database, objectIDs).toVector(database); + Vector centroid = Centroid.make(database, objectIDs); clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); } } - if (LOG.isDebugging()) { + if(LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("clusters ").append(clusters).append('\n'); @@ -664,14 +674,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the Manhattan segmental distance between o1 and o2 relative to the * specified dimensions */ - private DoubleDistance manhattanSegmentalDistance(V o1, V o2, TIntSet dimensions) { + private double manhattanSegmentalDistance(NumberVector o1, NumberVector o2, TIntSet dimensions) { double result = 0; - for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { final int d = iter.next(); result += Math.abs(o1.doubleValue(d) - o2.doubleValue(d)); } result /= dimensions.size(); - return new DoubleDistance(result); + return result; } /** @@ -682,20 +692,20 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param database the database holding the objects * @return a measure for the cluster quality */ - private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, TIntSet> dimensions, Relation<V> database) { + private double evaluateClusters(ArrayList<PROCLUSCluster> clusters, TIntSet[] dimensions, Relation<V> database) { double result = 0; - for (DBID m_i : clusters.keySet()) { - PROCLUSCluster c_i = clusters.get(m_i); - V centroid_i = c_i.centroid; + for(int i = 0; i < dimensions.length; i++) { + PROCLUSCluster c_i = clusters.get(i); + Vector centroid_i = c_i.centroid; - TIntSet dims_i = dimensions.get(m_i); + TIntSet dims_i = dimensions[i]; double w_i = 0; - for (TIntIterator iter = dims_i.iterator(); iter.hasNext();) { + for(TIntIterator iter = dims_i.iterator(); iter.hasNext();) { final int j = iter.next(); w_i += avgDistance(centroid_i, c_i.objectIDs, database, j); } - w_i /= dimensions.keySet().size(); + w_i /= dimensions.length; result += c_i.objectIDs.size() * w_i; } @@ -713,9 +723,9 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the average distance of the objects to the centroid along the * specified dimension */ - private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) { + private double avgDistance(Vector centroid, DBIDs objectIDs, Relation<V> database, int dimension) { Mean avg = new Mean(); - for (DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { + for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); avg.put(Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension))); } @@ -726,16 +736,18 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * Computes the bad medoids, where the medoid of a cluster with less than the * specified threshold of objects is bad. * + * @param m_current Current medoids * @param clusters the clusters * @param threshold the threshold * @return the bad medoids */ - private ModifiableDBIDs computeBadMedoids(Map<DBID, PROCLUSCluster> clusters, int threshold) { - ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(); - for (DBID m_i : clusters.keySet()) { - PROCLUSCluster c_i = clusters.get(m_i); - if (c_i.objectIDs.size() < threshold) { - badMedoids.add(m_i); + private DBIDs computeBadMedoids(ArrayDBIDs m_current, ArrayList<PROCLUSCluster> clusters, int threshold) { + ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(m_current.size()); + int i = 0; + for(DBIDIter it = m_current.iter(); it.valid(); it.advance(), i++) { + PROCLUSCluster c_i = clusters.get(i); + if(c_i == null || c_i.objectIDs.size() < threshold) { + badMedoids.add(it); } } return badMedoids; @@ -752,6 +764,36 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } /** + * Simple triple. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private static class DoubleIntInt implements Comparable<DoubleIntInt> { + protected double first; + + protected int dimi, dimj; + + public DoubleIntInt(double first, int second, int third) { + this.first = first; + this.dimi = second; + this.dimj = third; + } + + @Override + public int compareTo(DoubleIntInt o) { + if(this.first < o.first) { + return -1; + } + if(this.first > o.first) { + return +1; + } + return 0; + } + } + + /** * Encapsulates the attributes of a cluster. * * @apiviz.exclude @@ -770,16 +812,16 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster /** * The centroids of this cluster along each dimension. */ - V centroid; + Vector centroid; /** - * Provides a new cluster with the specified parameters. + * Constructor. * * @param objectIDs the ids of the objects belonging to this cluster * @param dimensions the correlated dimensions of this cluster * @param centroid the centroid of this cluster */ - public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, V centroid) { + public PROCLUSCluster(ModifiableDBIDs objectIDs, TIntSet dimensions, Vector centroid) { this.objectIDs = objectIDs; this.dimensions = dimensions; this.centroid = centroid; @@ -790,10 +832,11 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster StringBuilder result = new StringBuilder(); result.append("Dimensions: ["); boolean notFirst = false; - for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { - if (notFirst) { + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + if(notFirst) { result.append(','); - } else { + } + else { notFirst = true; } result.append(iter.next()); @@ -809,10 +852,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * * @return the correlated dimensions of this cluster as BitSet */ - public BitSet getDimensions() { - BitSet result = new BitSet(); - for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { - result.set(iter.next()); + public long[] getDimensions() { + int maxdim = 0; + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + int d = iter.next(); + maxdim = (d > maxdim) ? d : maxdim; + } + long[] result = BitsUtil.zero(maxdim); + for(TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + BitsUtil.setI(result, iter.next()); } return result; } @@ -825,14 +873,32 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractProjectedClustering.Parameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractProjectedClustering.Parameterizer { + /** + * Parameter to specify the multiplier for the initial number of medoids, + * must be an integer greater than 0. + * <p> + * Default value: {@code 10} + * </p> + * <p> + * Key: {@code -proclus.mi} + * </p> + */ + public static final OptionID M_I_ID = new OptionID("proclus.mi", "The multiplier for the initial number of medoids."); + /** * Parameter to specify the random generator seed. */ public static final OptionID SEED_ID = new OptionID("proclus.seed", "The random number generator seed."); + /** + * Multiplier for the initial number of medoids. + */ protected int m_i = -1; + /** + * Random generator + */ protected RandomFactory rnd; @Override @@ -843,14 +909,14 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster configKI(config); configL(config); - IntParameter m_iP = new IntParameter(M_I_ID, 10); - m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); - if (config.grab(m_iP)) { + IntParameter m_iP = new IntParameter(M_I_ID, 10) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(m_iP)) { m_i = m_iP.getValue(); } RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } |