Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering')
87 files changed, 4579 insertions, 1783 deletions
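Most of the churn across these files follows a few recurring API migrations visible in the hunks below: Java 7 diamond inference (new ArrayList<>() and friends), the rename of Clustering.addCluster to addToplevelCluster, and the move of the distance-result types from de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult to de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList. As a minimal sketch of the post-migration pattern for assembling a flat clustering result (a hypothetical helper, not part of the changeset itself; resultList and noise are assumed to be populated as in the DBSCAN hunk):

import java.util.List;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;

// Hypothetical helper illustrating the renamed result API.
static Clustering<Model> buildResult(List<ModifiableDBIDs> resultList, ModifiableDBIDs noise) {
  // Diamond operator replaces the explicit type arguments used before this changeset.
  Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
  for (ModifiableDBIDs res : resultList) {
    // addCluster(...) was renamed to addToplevelCluster(...).
    result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
  }
  // Noise objects become a separate top-level cluster flagged as noise.
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}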
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java index 05cc2b4f..0c4eb5fc 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java index f8b73f48..ee3b234c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -41,14 +41,14 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultIter; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; @@ -179,7 +179,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext public Clustering<Model> run(Database database, Relation<V> relation) { FiniteProgress objprog = getLogger().isVerbose() ? 
new FiniteProgress("Processing objects", relation.size(), getLogger()) : null; IndefiniteProgress clusprog = getLogger().isVerbose() ? new IndefiniteProgress("Number of clusters", getLogger()) : null; - resultList = new ArrayList<ModifiableDBIDs>(); + resultList = new ArrayList<>(); noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(relation.size()); @@ -215,14 +215,14 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext clusprog.setProcessed(resultList.size(), getLogger()); } - Clustering<Model> result = new Clustering<Model>(getLongResultName(), getShortResultName()); + Clustering<Model> result = new Clustering<>(getLongResultName(), getShortResultName()); for(Iterator<ModifiableDBIDs> resultListIter = resultList.iterator(); resultListIter.hasNext();) { Cluster<Model> c = new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER); - result.addCluster(c); + result.addToplevelCluster(c); } Cluster<Model> n = new Cluster<Model>(noise, true, ClusterModel.CLUSTER); - result.addCluster(n); + result.addToplevelCluster(n); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), getLogger()); @@ -279,7 +279,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } // compute weighted epsilon neighborhood - DistanceDBIDResult<DoubleDistance> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); + DistanceDBIDList<DoubleDistance> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); // neighbors < minPts -> noise if(neighbors.size() < minpts) { noise.add(startObjectID); @@ -294,7 +294,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); ModifiableDBIDs seeds = DBIDUtil.newHashSet(); - for (DistanceDBIDResultIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) { + for (DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) { int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension(); // nextID is not reachable from start object if(nextID_corrDim > lambda) { @@ -320,11 +320,11 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext continue; } - DistanceDBIDResult<DoubleDistance> reachables = rangeQuery.getRangeForDBID(iter, epsilon); + DistanceDBIDList<DoubleDistance> reachables = rangeQuery.getRangeForDBID(iter, epsilon); iter.remove(); if(reachables.size() > minpts) { - for (DistanceDBIDResultIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) { + for (DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) { int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension(); // r is not reachable from q if(corrDim_r > lambda) { @@ -395,7 +395,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext protected Integer lambda; protected void configInnerDistance(Parameterization config) { - ObjectParameter<DistanceFunction<V, D>> innerdistP = new ObjectParameter<DistanceFunction<V, D>>(AbstractProjectedDBSCAN.INNER_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + ObjectParameter<DistanceFunction<V, D>> innerdistP = new ObjectParameter<>(AbstractProjectedDBSCAN.INNER_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); if(config.grab(innerdistP)) { innerdist = 
innerdistP.instantiateClass(config); } @@ -403,7 +403,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext protected void configEpsilon(Parameterization config, DistanceFunction<V, D> innerdist) { D distanceParser = innerdist != null ? innerdist.getDistanceFactory() : null; - DistanceParameter<D> epsilonP = new DistanceParameter<D>(EPSILON_ID, distanceParser); + DistanceParameter<D> epsilonP = new DistanceParameter<>(EPSILON_ID, distanceParser); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } @@ -418,7 +418,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext } protected void configOuterDistance(Parameterization config, D epsilon, int minpts, Class<?> preprocessorClass, DistanceFunction<V, D> innerdist) { - ObjectParameter<LocallyWeightedDistanceFunction<V>> outerdistP = new ObjectParameter<LocallyWeightedDistanceFunction<V>>(OUTER_DISTANCE_FUNCTION_ID, LocallyWeightedDistanceFunction.class, LocallyWeightedDistanceFunction.class); + ObjectParameter<LocallyWeightedDistanceFunction<V>> outerdistP = new ObjectParameter<>(OUTER_DISTANCE_FUNCTION_ID, LocallyWeightedDistanceFunction.class, LocallyWeightedDistanceFunction.class); if(config.grab(outerdistP)) { // parameters for the distance function ListParameterization distanceFunctionParameters = new ListParameterization(); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/CanopyPreClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/CanopyPreClustering.java new file mode 100644 index 00000000..2dff7554 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/CanopyPreClustering.java @@ -0,0 +1,236 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering; +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.WrongParameterValueException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; + +/** + * Canopy pre-clustering is a simple preprocessing step for clustering. + * + * <p> + * Reference:<br> + * A. McCallum, K. Nigam, L.H. Ungar<br /> + * Efficient Clustering of High Dimensional Data Sets with Application to + * Reference Matching<br /> + * Proc. 6th ACM SIGKDD international conference on Knowledge discovery and data + * mining + * </p> + * + * @author Erich Schubert + * + * @param <O> Object type + * @param <D> Distance type + */ +@Reference(authors = "A. McCallum, K. Nigam, L.H. Ungar", title = "Efficient Clustering of High Dimensional Data Sets with Application to Reference Matching", booktitle = "Proc. 6th ACM SIGKDD international conference on Knowledge discovery and data mining", url = "http://dx.doi.org/10.1145%2F347090.347123") +public class CanopyPreClustering<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<O, D, Clustering<ClusterModel>> implements ClusteringAlgorithm<Clustering<ClusterModel>> { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(CanopyPreClustering.class); + + /** + * Threshold for inclusion + */ + private D t1; + + /** + * Threshold for removal + */ + private D t2; + + /** + * Constructor. + * + * @param distanceFunction Distance function + * @param t1 Inclusion threshold + * @param t2 Exclusion threshold + */ + public CanopyPreClustering(DistanceFunction<? super O, D> distanceFunction, D t1, D t2) { + super(distanceFunction); + this.t1 = t1; + this.t2 = t2; + } + + /** + * Run the algorithm + * + * @param database Database + * @param relation Relation to process + */ + public Clustering<ClusterModel> run(Database database, Relation<O> relation) { + DistanceQuery<O, D> dq = database.getDistanceQuery(relation, getDistanceFunction()); + ModifiableDBIDs ids = DBIDUtil.newHashSet(relation.getDBIDs()); + ArrayList<Cluster<ClusterModel>> clusters = new ArrayList<>(); + final int size = relation.size(); + + if(t1.compareTo(t2) <= 0) { + LOG.warning(Parameterizer.T1_ID.getName() + " must be larger than " + Parameterizer.T2_ID.getName()); + } + + FiniteProgress prog = LOG.isVerbose() ? 
new FiniteProgress("Canopy clustering", size, LOG) : null; + + DBIDVar first = DBIDUtil.newVar(); + while(!ids.isEmpty()) { + // Remove first element: + DBIDMIter iter = ids.iter(); + first.set(iter); + iter.remove(); + iter.advance(); + + // Start a new cluster: + ModifiableDBIDs cids = DBIDUtil.newArray(); + cids.add(first); + + // Compare to remaining objects: + for(; iter.valid(); iter.advance()) { + D dist = dq.distance(first, iter); + // Inclusion threshold: + if(t1.compareTo(dist) >= 0) { + cids.add(iter); + } + // Removal threshold: + if(t2.compareTo(dist) >= 0) { + iter.remove(); + } + } + // TODO: remember the central object using a CanopyModel? + // Construct cluster: + clusters.add(new Cluster<>(cids, ClusterModel.CLUSTER)); + + if(prog != null) { + prog.setProcessed(size - ids.size(), LOG); + } + } + if(prog != null) { + prog.ensureCompleted(LOG); + } + + return new Clustering<>("Canopy clustering", "canopy-clustering", clusters); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> Object type + * @param <D> Distance type + */ + public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Parameter for the inclusion threshold of canopy clustering. + * + * Note: t1 > t2 + * + * Syntax: + * + * <pre> + * -canopy.t1 <value> + * </pre> + */ + public static final OptionID T1_ID = new OptionID("canopy.t1", "Inclusion threshold for canopy clustering. t1 > t2!"); + + /** + * Parameter for the removal threshold of canopy clustering. + * + * Note: t1 > t2 + * + * Syntax: + * + * <pre> + * -canopy.t2 <value> + * </pre> + */ + public static final OptionID T2_ID = new OptionID("canopy.t2", "Removal threshold for canopy clustering. 
t1 > t2!"); + + /** + * Threshold for inclusion + */ + private D t1; + + /** + * Threshold for removal + */ + private D t2; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + DistanceParameter<D> t1P = new DistanceParameter<>(T1_ID, distanceFunction); + if(config.grab(t1P)) { + t1 = t1P.getValue(); + } + + DistanceParameter<D> t2P = new DistanceParameter<>(T2_ID, distanceFunction); + // TODO: add distance constraint t1 > t2 + if(config.grab(t2P)) { + t2 = t2P.getValue(); + if(t1.compareTo(t2) <= 0) { + config.reportError(new WrongParameterValueException(t2P, T1_ID.getName() + " must be larger than " + T2_ID.getName())); + } + } + } + + @Override + protected CanopyPreClustering<O, D> makeInstance() { + return new CanopyPreClustering<>(distanceFunction, t1, t2); + } + + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java index 8f637460..249dc313 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/ClusteringAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.database.Database; * * @apiviz.has Clustering * @apiviz.has Model + * @apiviz.excludeSubtypes * * @param <C> Clustering type */ diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java index fcf81faa..57dcb435 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -40,10 +40,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; @@ -140,7 +140,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor FiniteProgress objprog = LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null; IndefiniteProgress clusprog = LOG.isVerbose() ? 
new IndefiniteProgress("Number of clusters", LOG) : null; - resultList = new ArrayList<ModifiableDBIDs>(); + resultList = new ArrayList<>(); noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(size); if(size < minpts) { @@ -170,14 +170,14 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor clusprog.setCompleted(LOG); } - Clustering<Model> result = new Clustering<Model>("DBSCAN Clustering", "dbscan-clustering"); + Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering"); for(ModifiableDBIDs res : resultList) { Cluster<Model> c = new Cluster<Model>(res, ClusterModel.CLUSTER); - result.addCluster(c); + result.addToplevelCluster(c); } Cluster<Model> n = new Cluster<Model>(noise, true, ClusterModel.CLUSTER); - result.addCluster(n); + result.addToplevelCluster(n); return result; } @@ -193,7 +193,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * @param objprog the progress object for logging the current status */ protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { - DistanceDBIDResult<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); + DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon); // startObject is no core-object if(neighbors.size() < minpts) { @@ -224,7 +224,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor while(seeds.size() > 0) { DBIDMIter o = seeds.iter(); - DistanceDBIDResult<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon); + DistanceDBIDList<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon); o.remove(); if(neighborhood.size() >= minpts) { @@ -289,7 +289,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - DistanceParameter<D> epsilonP = new DistanceParameter<D>(EPSILON_ID, distanceFunction); + DistanceParameter<D> epsilonP = new DistanceParameter<>(EPSILON_ID, distanceFunction); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } @@ -303,7 +303,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor @Override protected DBSCAN<O, D> makeInstance() { - return new DBSCAN<O, D>(distanceFunction, epsilon, minpts); + return new DBSCAN<>(distanceFunction, epsilon, minpts); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java index 22875715..3c2e0278 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,9 +23,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.set.TIntSet; + import java.util.Collection; import java.util.List; -import java.util.Set; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.KNNJoin; @@ -37,11 +38,11 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.KNNResult; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.index.tree.LeafEntry; import de.lmu.ifi.dbs.elki.index.tree.TreeIndexPathComponent; @@ -119,14 +120,14 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends */ public DeLiClu(DistanceFunction<? super NV, D> distanceFunction, int minpts) { super(distanceFunction); - this.knnJoin = new KNNJoin<NV, D, DeLiCluNode, DeLiCluEntry>(distanceFunction, minpts); + this.knnJoin = new KNNJoin<>(distanceFunction, minpts); this.minpts = minpts; } public ClusterOrderResult<D> run(Database database, Relation<NV> relation) { Collection<DeLiCluTreeIndex<NV>> indexes = ResultUtil.filterResults(database, DeLiCluTreeIndex.class); if(indexes.size() != 1) { - throw new AbortException("DeLiClu found " + indexes.size() + " DeLiCluTree indexes, expected exactly one."); + throw new AbortException("DeLiClu found " + indexes.size() + " DeLiCluTree indexes. DeLiClu needs a special index to operate, therefore you need to add this index to your database."); } DeLiCluTreeIndex<NV> index = indexes.iterator().next(); // FIXME: check that the index matches the relation! @@ -141,13 +142,13 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends if(LOG.isVerbose()) { LOG.verbose("knnJoin..."); } - DataStore<KNNResult<D>> knns = knnJoin.run(database, relation); + DataStore<KNNList<D>> knns = knnJoin.run(database, relation); FiniteProgress progress = LOG.isVerbose() ? 
new FiniteProgress("DeLiClu", relation.size(), LOG) : null; final int size = relation.size(); - ClusterOrderResult<D> clusterOrder = new ClusterOrderResult<D>("DeLiClu Clustering", "deliclu-clustering"); - heap = new UpdatableHeap<SpatialObjectPair>(); + ClusterOrderResult<D> clusterOrder = new ClusterOrderResult<>("DeLiClu Clustering", "deliclu-clustering"); + heap = new UpdatableHeap<>(); // add start object to cluster order and (root, root) to priority queue DBID startID = getStartObject(relation); @@ -217,7 +218,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends * @param nodePair the pair of nodes to be expanded * @param knns the knn list */ - private void expandNodes(DeLiCluTree index, SpatialPrimitiveDistanceFunction<NV, D> distFunction, SpatialObjectPair nodePair, DataStore<KNNResult<D>> knns) { + private void expandNodes(DeLiCluTree index, SpatialPrimitiveDistanceFunction<NV, D> distFunction, SpatialObjectPair nodePair, DataStore<KNNList<D>> knns) { DeLiCluNode node1 = index.getNode(((SpatialDirectoryEntry) nodePair.entry1).getPageID()); DeLiCluNode node2 = index.getNode(((SpatialDirectoryEntry) nodePair.entry2).getPageID()); @@ -274,7 +275,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends * @param node2 the second node * @param knns the knn list */ - private void expandLeafNodes(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluNode node1, DeLiCluNode node2, DataStore<KNNResult<D>> knns) { + private void expandLeafNodes(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluNode node1, DeLiCluNode node2, DataStore<KNNList<D>> knns) { if(LOG.isDebuggingFinest()) { LOG.debugFinest("ExpandLeafNodes: " + node1.getPageID() + " + " + node2.getPageID()); } @@ -310,12 +311,12 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends * @param path the path of the object inserted last * @param knns the knn list */ - private void reinsertExpanded(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluTree index, List<TreeIndexPathComponent<DeLiCluEntry>> path, DataStore<KNNResult<D>> knns) { + private void reinsertExpanded(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluTree index, List<TreeIndexPathComponent<DeLiCluEntry>> path, DataStore<KNNList<D>> knns) { SpatialDirectoryEntry rootEntry = (SpatialDirectoryEntry) path.remove(0).getEntry(); reinsertExpanded(distFunction, index, path, 0, rootEntry, knns); } - private void reinsertExpanded(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluTree index, List<TreeIndexPathComponent<DeLiCluEntry>> path, int pos, SpatialDirectoryEntry parentEntry, DataStore<KNNResult<D>> knns) { + private void reinsertExpanded(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluTree index, List<TreeIndexPathComponent<DeLiCluEntry>> path, int pos, SpatialDirectoryEntry parentEntry, DataStore<KNNList<D>> knns) { DeLiCluNode parentNode = index.getNode(parentEntry.getPageID()); SpatialEntry entry2 = path.get(pos).getEntry(); @@ -332,7 +333,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends } } else { - Set<Integer> expanded = index.getExpanded(entry2); + TIntSet expanded = index.getExpanded(entry2); for(int i = 0; i < parentNode.getNumEntries(); i++) { SpatialDirectoryEntry entry1 = (SpatialDirectoryEntry) parentNode.getEntry(i); @@ -503,7 +504,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends @Override protected DeLiClu<NV, D> makeInstance() { - return 
new DeLiClu<NV, D>(distanceFunction, minpts); + return new DeLiClu<>(distanceFunction, minpts); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java index 514e63bd..c66442a1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -46,7 +46,7 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; @@ -179,13 +179,13 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< if (LOG.isVerbose()) { LOG.verbose("initializing " + k + " models"); } - List<Vector> means = new ArrayList<Vector>(); - for (NumberVector<?> nv : initializer.chooseInitialMeans(relation, k, EuclideanDistanceFunction.STATIC)) { + List<Vector> means = new ArrayList<>(); + for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC)) { means.add(nv.getColumnVector()); } - List<Matrix> covarianceMatrices = new ArrayList<Matrix>(k); + List<Matrix> covarianceMatrices = new ArrayList<>(k); double[] normDistrFactor = new double[k]; - List<Matrix> invCovMatr = new ArrayList<Matrix>(k); + List<Matrix> invCovMatr = new ArrayList<>(k); double[] clusterWeights = new double[k]; probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class); @@ -193,7 +193,13 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< for (int i = 0; i < k; i++) { Matrix m = Matrix.identity(dimensionality, dimensionality); covarianceMatrices.add(m); - normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * m.det()); + final double det = m.det(); + if (det > 0.) 
{ + normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det); + } else { + LOG.warning("Encountered matrix with 0 determinant - degenerated."); + normDistrFactor[i] = 1.0; // Not really well defined + } invCovMatr.add(m.inverse()); clusterWeights[i] = 1.0 / k; if (LOG.isDebuggingFinest()) { @@ -201,7 +207,7 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< msg.append(" model ").append(i).append(":\n"); msg.append(" mean: ").append(means.get(i)).append('\n'); msg.append(" m:\n").append(FormatUtil.format(m, " ")).append('\n'); - msg.append(" m.det(): ").append(m.det()).append('\n'); + msg.append(" m.det(): ").append(det).append('\n'); msg.append(" cluster weight: ").append(clusterWeights[i]).append('\n'); msg.append(" normDistFact: ").append(normDistrFactor[i]).append('\n'); LOG.debugFine(msg.toString()); @@ -222,7 +228,7 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< em = emNew; // recompute models - List<Vector> meanSums = new ArrayList<Vector>(k); + List<Vector> meanSums = new ArrayList<>(k); double[] sumOfClusterProbabilities = new double[k]; for (int i = 0; i < k; i++) { @@ -260,7 +266,13 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT)); } for (int i = 0; i < k; i++) { - normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * covarianceMatrices.get(i).det()); + final double det = covarianceMatrices.get(i).det(); + if (det > 0.) { + normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det); + } else { + LOG.warning("Encountered matrix with 0 determinant - degenerated."); + normDistrFactor[i] = 1.0; // Not really well defined + } invCovMatr.set(i, covarianceMatrices.get(i).inverse()); } // reassign probabilities @@ -279,7 +291,7 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< } // fill result with clusters and models - List<ModifiableDBIDs> hardClusters = new ArrayList<ModifiableDBIDs>(k); + List<ModifiableDBIDs> hardClusters = new ArrayList<>(k); for (int i = 0; i < k; i++) { hardClusters.add(DBIDUtil.newHashSet()); } @@ -298,14 +310,14 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< hardClusters.get(maxIndex).add(iditer); } final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<EMModel<V>> result = new Clustering<EMModel<V>>("EM Clustering", "em-clustering"); + Clustering<EMModel<V>> result = new Clustering<>("EM Clustering", "em-clustering"); // provide models within the result for (int i = 0; i < k; i++) { // TODO: re-do labeling. 
// SimpleClassLabel label = new SimpleClassLabel(); // label.init(result.canonicalClusterLabel(i)); - Cluster<EMModel<V>> model = new Cluster<EMModel<V>>(hardClusters.get(i), new EMModel<V>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i))); - result.addCluster(model); + Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i))); + result.addToplevelCluster(model); } return result; } @@ -339,6 +351,9 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< if (LOG.isDebuggingFinest()) { LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " ")); } + if (!(prob >= 0.)) { + LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor[i]); + } probabilities[i] = prob; } double priorProbability = 0.0; @@ -352,13 +367,12 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< double[] clusterProbabilities = new double[k]; for (int i = 0; i < k; i++) { - assert (priorProbability >= 0.0); assert (clusterWeights[i] >= 0.0); // do not divide by zero! - if (priorProbability == 0.0) { - clusterProbabilities[i] = 0.0; - } else { + if (priorProbability > 0.0) { clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i]; + } else { + clusterProbabilities[i] = 0.0; } } probClusterIGivenX.put(iditer, clusterProbabilities); @@ -412,7 +426,7 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< k = kP.getValue(); } - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } @@ -433,7 +447,7 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering< @Override protected EM<V> makeInstance() { - return new EM<V>(k, delta, initializer, maxiter); + return new EM<>(k, delta, initializer, maxiter); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/NaiveMeanShiftClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/NaiveMeanShiftClustering.java index 8429d8ac..a4d6e307 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/NaiveMeanShiftClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/NaiveMeanShiftClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -36,19 +36,19 @@ import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; +import 
de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultIter; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; -import de.lmu.ifi.dbs.elki.math.statistics.EpanechnikovKernelDensityFunction; -import de.lmu.ifi.dbs.elki.math.statistics.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; @@ -135,7 +135,7 @@ public class NaiveMeanShiftClustering<V extends NumberVector<?>, D extends Numbe final double threshold = bandwidth * 1E-10; // Result store: - ArrayList<Pair<V, ModifiableDBIDs>> clusters = new ArrayList<Pair<V, ModifiableDBIDs>>(); + ArrayList<Pair<V, ModifiableDBIDs>> clusters = new ArrayList<>(); ModifiableDBIDs noise = DBIDUtil.newArray(); @@ -148,11 +148,11 @@ public class NaiveMeanShiftClustering<V extends NumberVector<?>, D extends Numbe // Compute new position: V newvec = null; { - DistanceDBIDResult<D> neigh = rangeq.getRangeForObject(position, range); + DistanceDBIDList<D> neigh = rangeq.getRangeForObject(position, range); boolean okay = (neigh.size() > 1) || (neigh.size() >= 1 && j > 1); if (okay) { Centroid newpos = new Centroid(dim); - for (DistanceDBIDResultIter<D> niter = neigh.iter(); niter.valid(); niter.advance()) { + for (DistanceDBIDListIter<D> niter = neigh.iter(); niter.valid(); niter.advance()) { final double weight = kernel.density(niter.getDistance().doubleValue() / bandwidth); newpos.put(relation.get(niter), weight); } @@ -206,14 +206,14 @@ public class NaiveMeanShiftClustering<V extends NumberVector<?>, D extends Numbe prog.ensureCompleted(LOG); } - ArrayList<Cluster<MeanModel<V>>> cs = new ArrayList<Cluster<MeanModel<V>>>(clusters.size()); + ArrayList<Cluster<MeanModel<V>>> cs = new ArrayList<>(clusters.size()); for (Pair<V, ModifiableDBIDs> pair : clusters) { - cs.add(new Cluster<MeanModel<V>>(pair.second, new MeanModel<V>(pair.first))); + cs.add(new Cluster<>(pair.second, new MeanModel<>(pair.first))); } if (noise.size() > 0) { cs.add(new Cluster<MeanModel<V>>(noise, true)); } - Clustering<MeanModel<V>> c = new Clustering<MeanModel<V>>("Mean-shift Clustering", "mean-shift-clustering", cs); + Clustering<MeanModel<V>> c = new Clustering<>("Mean-shift Clustering", "mean-shift-clustering", cs); return c; } @@ -261,11 +261,11 @@ public class NaiveMeanShiftClustering<V extends NumberVector<?>, D extends Numbe @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<KernelDensityFunction>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class); + 
ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class); if (config.grab(kernelP)) { kernel = kernelP.instantiateClass(config); } - DistanceParameter<D> rangeP = new DistanceParameter<D>(RANGE_ID, distanceFunction); + DistanceParameter<D> rangeP = new DistanceParameter<>(RANGE_ID, distanceFunction); if (config.grab(rangeP)) { range = rangeP.getValue(); } @@ -273,7 +273,7 @@ public class NaiveMeanShiftClustering<V extends NumberVector<?>, D extends Numbe @Override protected NaiveMeanShiftClustering<V, D> makeInstance() { - return new NaiveMeanShiftClustering<V, D>(distanceFunction, kernel, range); + return new NaiveMeanShiftClustering<>(distanceFunction, kernel, range); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java index 2c098dc0..e928d041 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -31,17 +31,17 @@ import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DistanceDBIDPair; -import de.lmu.ifi.dbs.elki.database.ids.DoubleDistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultIter; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DoubleDistanceDBIDResultIter; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -139,7 +139,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("OPTICS", size, LOG) : null; processedIDs = DBIDUtil.newHashSet(size); - ClusterOrderResult<D> clusterOrder = new ClusterOrderResult<D>("OPTICS Clusterorder", "optics-clusterorder"); + ClusterOrderResult<D> clusterOrder = new ClusterOrderResult<>("OPTICS Clusterorder", "optics-clusterorder"); if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && DoubleDistance.class.isInstance(epsilon)) { // Optimized codepath for double-based distances. 
Avoids Java @@ -182,25 +182,25 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * the algorithm */ protected void expandClusterOrder(ClusterOrderResult<D> clusterOrder, Database database, RangeQuery<O, D> rangeQuery, DBID objectID, D epsilon, FiniteProgress progress) { - UpdatableHeap<ClusterOrderEntry<D>> heap = new UpdatableHeap<ClusterOrderEntry<D>>(); - heap.add(new GenericClusterOrderEntry<D>(objectID, null, getDistanceFunction().getDistanceFactory().infiniteDistance())); + UpdatableHeap<ClusterOrderEntry<D>> heap = new UpdatableHeap<>(); + heap.add(new GenericClusterOrderEntry<>(objectID, null, getDistanceFunction().getDistanceFactory().infiniteDistance())); while(!heap.isEmpty()) { final ClusterOrderEntry<D> current = heap.poll(); clusterOrder.add(current); processedIDs.add(current.getID()); - DistanceDBIDResult<D> neighbors = rangeQuery.getRangeForDBID(current.getID(), epsilon); + DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(current.getID(), epsilon); if(neighbors.size() >= minpts) { final DistanceDBIDPair<D> last = neighbors.get(minpts - 1); D coreDistance = last.getDistance(); - for(DistanceDBIDResultIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { if(processedIDs.contains(neighbor)) { continue; } D reachability = DistanceUtil.max(neighbor.getDistance(), coreDistance); - heap.add(new GenericClusterOrderEntry<D>(DBIDUtil.deref(neighbor), current.getID(), reachability)); + heap.add(new GenericClusterOrderEntry<>(DBIDUtil.deref(neighbor), current.getID(), reachability)); } } if(progress != null) { @@ -221,7 +221,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor * the algorithm */ protected void expandClusterOrderDouble(ClusterOrderResult<DoubleDistance> clusterOrder, Database database, RangeQuery<O, DoubleDistance> rangeQuery, DBID objectID, DoubleDistance epsilon, FiniteProgress progress) { - UpdatableHeap<DoubleDistanceClusterOrderEntry> heap = new UpdatableHeap<DoubleDistanceClusterOrderEntry>(); + UpdatableHeap<DoubleDistanceClusterOrderEntry> heap = new UpdatableHeap<>(); heap.add(new DoubleDistanceClusterOrderEntry(objectID, null, Double.POSITIVE_INFINITY)); while(!heap.isEmpty()) { @@ -229,17 +229,17 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor clusterOrder.add(current); processedIDs.add(current.getID()); - DistanceDBIDResult<DoubleDistance> neighbors = rangeQuery.getRangeForDBID(current.getID(), epsilon); + DistanceDBIDList<DoubleDistance> neighbors = rangeQuery.getRangeForDBID(current.getID(), epsilon); if(neighbors.size() >= minpts) { final DistanceDBIDPair<DoubleDistance> last = neighbors.get(minpts - 1); if(last instanceof DoubleDistanceDBIDPair) { double coreDistance = ((DoubleDistanceDBIDPair) last).doubleDistance(); - for(DistanceDBIDResultIter<DoubleDistance> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + for(DistanceDBIDListIter<DoubleDistance> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { if(processedIDs.contains(neighbor)) { continue; } - double reachability = Math.max(((DoubleDistanceDBIDResultIter) neighbor).doubleDistance(), coreDistance); + double reachability = Math.max(((DoubleDistanceDBIDListIter) neighbor).doubleDistance(), coreDistance); heap.add(new DoubleDistanceClusterOrderEntry(DBIDUtil.deref(neighbor), current.getID(), reachability)); } } @@ -248,7 +248,7 @@ 
public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor // Only if we got an optimized result before. double coreDistance = last.getDistance().doubleValue(); - for(DistanceDBIDResultIter<DoubleDistance> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + for(DistanceDBIDListIter<DoubleDistance> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { if(processedIDs.contains(neighbor)) { continue; } @@ -298,7 +298,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - DistanceParameter<D> epsilonP = new DistanceParameter<D>(EPSILON_ID, distanceFunction, true); + DistanceParameter<D> epsilonP = new DistanceParameter<>(EPSILON_ID, distanceFunction, true); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } @@ -312,7 +312,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor @Override protected OPTICS<O, D> makeInstance() { - return new OPTICS<O, D>(distanceFunction, epsilon, minpts); + return new OPTICS<>(distanceFunction, epsilon, minpts); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java index 3ead6f3e..82d7ec88 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSTypeAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java index 39a0ebd6..583d402b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -46,8 +46,6 @@ import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.result.IterableResult; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; -import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyHashmapList; -import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.ModifiableHierarchy; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; @@ -135,13 +133,13 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< double mib = 0.0; // TODO: make it configurable to keep this list; this is mostly useful for // visualization - List<SteepArea> salist = new ArrayList<SteepArea>(); - List<SteepDownArea> sdaset = new ArrayList<SteepDownArea>(); - ModifiableHierarchy<Cluster<OPTICSModel>> hier = new HierarchyHashmapList<Cluster<OPTICSModel>>(); - HashSet<Cluster<OPTICSModel>> curclusters = new HashSet<Cluster<OPTICSModel>>(); + List<SteepArea> salist = new ArrayList<>(); + List<SteepDownArea> sdaset = new ArrayList<>(); + final Clustering<OPTICSModel> clustering = new Clustering<>("OPTICS Xi-Clusters", "optics"); + HashSet<Cluster<OPTICSModel>> curclusters = new HashSet<>(); HashSetModifiableDBIDs unclaimedids = DBIDUtil.newHashSet(relation.getDBIDs()); - SteepScanPosition<N> scan = new SteepScanPosition<N>(clusterOrder); + SteepScanPosition<N> scan = new SteepScanPosition<>(clusterOrder); while(scan.hasNext()) { final int curpos = scan.index; // Update maximum-inbetween @@ -285,7 +283,7 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< LOG.debugFine("Found cluster with " + dbids.size() + " new objects, length " + (cstart - cend + 1)); } OPTICSModel model = new OPTICSModel(cstart, cend); - Cluster<OPTICSModel> cluster = new Cluster<OPTICSModel>("Cluster_" + cstart + "_" + cend, dbids, model, hier); + Cluster<OPTICSModel> cluster = new Cluster<>("Cluster_" + cstart + "_" + cend, dbids, model); // Build the hierarchy { Iterator<Cluster<OPTICSModel>> iter = curclusters.iterator(); @@ 
-293,7 +291,7 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< Cluster<OPTICSModel> clus = iter.next(); OPTICSModel omodel = clus.getModel(); if(model.getStartIndex() <= omodel.getStartIndex() && omodel.getEndIndex() <= model.getEndIndex()) { - hier.add(cluster, clus); + clustering.addChildCluster(cluster, clus); iter.remove(); } } @@ -308,23 +306,22 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< } } if(curclusters.size() > 0 || unclaimedids.size() > 0) { - final Clustering<OPTICSModel> clustering = new Clustering<OPTICSModel>("OPTICS Xi-Clusters", "optics"); if(unclaimedids.size() > 0) { final Cluster<OPTICSModel> allcluster; if(clusterOrder.get(clusterOrder.size() - 1).getReachability().isInfiniteDistance()) { - allcluster = new Cluster<OPTICSModel>("Noise", unclaimedids, true, new OPTICSModel(0, clusterOrder.size() - 1), hier); + allcluster = new Cluster<>("Noise", unclaimedids, true, new OPTICSModel(0, clusterOrder.size() - 1)); } else { - allcluster = new Cluster<OPTICSModel>("Cluster", unclaimedids, new OPTICSModel(0, clusterOrder.size() - 1), hier); + allcluster = new Cluster<>("Cluster", unclaimedids, new OPTICSModel(0, clusterOrder.size() - 1)); } for(Cluster<OPTICSModel> cluster : curclusters) { - hier.add(allcluster, cluster); + clustering.addChildCluster(allcluster, cluster); } - clustering.addCluster(allcluster); + clustering.addToplevelCluster(allcluster); } else { for(Cluster<OPTICSModel> cluster : curclusters) { - clustering.addCluster(cluster); + clustering.addToplevelCluster(cluster); } } clustering.addChildResult(clusterOrderResult); @@ -663,7 +660,7 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< xi = xiP.doubleValue(); } - ClassParameter<OPTICSTypeAlgorithm<D>> opticsP = new ClassParameter<OPTICSTypeAlgorithm<D>>(XIALG_ID, OPTICSTypeAlgorithm.class, OPTICS.class); + ClassParameter<OPTICSTypeAlgorithm<D>> opticsP = new ClassParameter<>(XIALG_ID, OPTICSTypeAlgorithm.class, OPTICS.class); if(config.grab(opticsP)) { optics = opticsP.instantiateClass(config); } @@ -671,7 +668,7 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm< @Override protected OPTICSXi<D> makeInstance() { - return new OPTICSXi<D>(optics, xi); + return new OPTICSXi<>(optics, xi); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java deleted file mode 100644 index 3e1f0650..00000000 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SLINK.java +++ /dev/null @@ -1,818 +0,0 @@ -package de.lmu.ifi.dbs.elki.algorithm.clustering; - -/* - This file is part of ELKI: - Environment for Developing KDD-Applications Supported by Index-Structures - - Copyright (C) 2012 - Ludwig-Maximilians-Universität München - Lehr- und Forschungseinheit für Datenbanksysteme - ELKI Development Team - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -import gnu.trove.list.array.TDoubleArrayList; - -import java.util.ArrayList; -import java.util.Comparator; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.data.Cluster; -import de.lmu.ifi.dbs.elki.data.Clustering; -import de.lmu.ifi.dbs.elki.data.model.DendrogramModel; -import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeInformation; -import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.database.datastore.DBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.DataStore; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; -import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.DoubleDistanceDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDistanceDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; -import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; -import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; -import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; -import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.DistanceUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.logging.Logging; -import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; -import 
de.lmu.ifi.dbs.elki.result.BasicResult; -import de.lmu.ifi.dbs.elki.result.OrderingFromDataStore; -import de.lmu.ifi.dbs.elki.result.Result; -import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyHashmapList; -import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.ModifiableHierarchy; -import de.lmu.ifi.dbs.elki.utilities.documentation.Description; -import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; -import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; - -/** - * Implementation of the efficient Single-Link Algorithm SLINK of R. Sibson. - * <p> - * Reference: R. Sibson: SLINK: An optimally efficient algorithm for the - * single-link cluster method. <br> - * In: The Computer Journal 16 (1973), No. 1, p. 30-34. - * </p> - * - * @author Elke Achtert - * @param <O> the type of DatabaseObject the algorithm is applied on - * @param <D> the type of Distance used - */ -@Title("SLINK: Single Link Clustering") -@Description("Hierarchical clustering algorithm based on single-link connectivity.") -@Reference(authors = "R. Sibson", title = "SLINK: An optimally efficient algorithm for the single-link cluster method", booktitle = "The Computer Journal 16 (1973), No. 1, p. 30-34.", url = "http://dx.doi.org/10.1093/comjnl/16.1.30") -public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<O, D, Result> { - /** - * The logger for this class. - */ - private static final Logging LOG = Logging.getLogger(SLINK.class); - - /** - * Minimum number of clusters to extract - */ - private int minclusters = -1; - - /** - * Constructor. - * - * @param distanceFunction Distance function - * @param minclusters Minimum clusters to extract. Can be {@code -1}. - */ - public SLINK(DistanceFunction<? super O, D> distanceFunction, int minclusters) { - super(distanceFunction); - this.minclusters = minclusters; - } - - /** - * Performs the SLINK algorithm on the given database. - */ - public Result run(Database database, Relation<O> relation) { - DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); - @SuppressWarnings("unchecked") - Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass(); - WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); - WritableDataStore<D> lambda = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, distCls); - // Temporary storage for m. - WritableDataStore<D> m = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); - - FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running SLINK", relation.size(), LOG) : null; - // has to be an array for monotonicity reasons! - ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size()); - - // Optimized code path for double distances - if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && lambda instanceof WritableDoubleDistanceDataStore && m instanceof WritableDoubleDistanceDataStore) { - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<? 
super O> dist = (PrimitiveDoubleDistanceFunction<? super O>) getDistanceFunction(); - WritableDoubleDistanceDataStore lambdad = (WritableDoubleDistanceDataStore) lambda; - WritableDoubleDistanceDataStore md = (WritableDoubleDistanceDataStore) m; - // apply the algorithm - for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) { - step1double(id, pi, lambdad); - step2double(id, processedIDs, distQuery.getRelation(), dist, md); - step3double(id, pi, lambdad, processedIDs, md); - step4double(id, pi, lambdad, processedIDs); - - processedIDs.add(id); - - if (progress != null) { - progress.incrementProcessed(LOG); - } - } - } else { - // apply the algorithm - for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) { - step1(id, pi, lambda); - step2(id, processedIDs, distQuery, m); - step3(id, pi, lambda, processedIDs, m); - step4(id, pi, lambda, processedIDs); - - processedIDs.add(id); - - if (progress != null) { - progress.incrementProcessed(LOG); - } - } - } - - if (progress != null) { - progress.ensureCompleted(LOG); - } - // We don't need m anymore. - m.destroy(); - m = null; - - // Build dendrogam clusters identified by their target object - if (LOG.isVerbose()) { - LOG.verbose("Extracting clusters."); - } - final BasicResult result; - if (lambda instanceof DoubleDistanceDataStore) { - result = extractClustersDouble(relation.getDBIDs(), pi, (DoubleDistanceDataStore) lambda, minclusters); - } else { - result = extractClusters(relation.getDBIDs(), pi, lambda, minclusters); - } - - result.addChildResult(new MaterializedRelation<DBID>("SLINK pi", "slink-order", TypeUtil.DBID, pi, processedIDs)); - result.addChildResult(new MaterializedRelation<D>("SLINK lambda", "slink-order", new SimpleTypeInformation<D>(distCls), lambda, processedIDs)); - result.addChildResult(new OrderingFromDataStore<D>("SLINK order", "slink-order", processedIDs, lambda)); - return result; - } - - /** - * First step: Initialize P(id) = id, L(id) = infinity. - * - * @param id the id of the object to be inserted into the pointer - * representation - * @param pi Pi data store - * @param lambda Lambda data store - */ - private void step1(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda) { - // P(n+1) = n+1: - pi.put(id, id); - // L(n+1) = infinity - lambda.put(id, getDistanceFunction().getDistanceFactory().infiniteDistance()); - } - - /** - * Second step: Determine the pairwise distances from all objects in the - * pointer representation to the new object with the specified id. 
- * - * @param id the id of the object to be inserted into the pointer - * representation - * @param processedIDs the already processed ids - * @param m Data store - * @param distFunc Distance function to use - */ - private void step2(DBIDRef id, DBIDs processedIDs, DistanceQuery<O, D> distFunc, WritableDataStore<D> m) { - O newObj = distFunc.getRelation().get(id); - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - // M(i) = dist(i, n+1) - m.put(it, distFunc.distance(it, newObj)); - } - } - - /** - * Third step: Determine the values for P and L - * - * @param id the id of the object to be inserted into the pointer - * representation - * @param pi Pi data store - * @param lambda Lambda data store - * @param processedIDs the already processed ids - * @param m Data store - */ - private void step3(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda, DBIDs processedIDs, WritableDataStore<D> m) { - DBIDVar p_i = DBIDUtil.newVar(); - // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - D l_i = lambda.get(it); - D m_i = m.get(it); - pi.assignVar(it, p_i); // p_i = pi(it) - D mp_i = m.get(p_i); - - // if L(i) >= M(i) - if (l_i.compareTo(m_i) >= 0) { - // M(P(i)) = min { M(P(i)), L(i) } - m.put(p_i, DistanceUtil.min(mp_i, l_i)); - - // L(i) = M(i) - lambda.put(it, m_i); - - // P(i) = n+1; - pi.put(it, id); - } else { - // M(P(i)) = min { M(P(i)), M(i) } - m.put(p_i, DistanceUtil.min(mp_i, m_i)); - } - } - } - - /** - * Fourth step: Actualize the clusters if necessary - * - * @param id the id of the current object - * @param pi Pi data store - * @param lambda Lambda data store - * @param processedIDs the already processed ids - */ - private void step4(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda, DBIDs processedIDs) { - DBIDVar p_i = DBIDUtil.newVar(); - // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - D l_i = lambda.get(it); - pi.assignVar(it, p_i); // p_i = pi(it) - D lp_i = lambda.get(p_i); - - // if L(i) >= L(P(i)) - if (l_i.compareTo(lp_i) >= 0) { - // P(i) = n+1 - pi.put(it, id); - } - } - } - - /** - * First step: Initialize P(id) = id, L(id) = infinity. - * - * @param id the id of the object to be inserted into the pointer - * representation - * @param pi Pi data store - * @param lambda Lambda data store - */ - private void step1double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda) { - // P(n+1) = n+1: - pi.put(id, id); - // L(n+1) = infinity - lambda.putDouble(id, Double.POSITIVE_INFINITY); - } - - /** - * Second step: Determine the pairwise distances from all objects in the - * pointer representation to the new object with the specified id. - * - * @param id the id of the object to be inserted into the pointer - * representation - * @param processedIDs the already processed ids - * @param m Data store - * @param relation Data relation - * @param distFunc Distance function to use - */ - private void step2double(DBIDRef id, DBIDs processedIDs, Relation<? extends O> relation, PrimitiveDoubleDistanceFunction<? 
super O> distFunc, WritableDoubleDistanceDataStore m) { - O newObj = relation.get(id); - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - // M(i) = dist(i, n+1) - m.putDouble(it, distFunc.doubleDistance(relation.get(it), newObj)); - } - } - - /** - * Third step: Determine the values for P and L - * - * @param id the id of the object to be inserted into the pointer - * representation - * @param pi Pi data store - * @param lambda Lambda data store - * @param processedIDs the already processed ids - * @param m Data store - */ - private void step3double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda, DBIDs processedIDs, WritableDoubleDistanceDataStore m) { - DBIDVar p_i = DBIDUtil.newVar(); - // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - double l_i = lambda.doubleValue(it); - double m_i = m.doubleValue(it); - pi.assignVar(it, p_i); // p_i = pi(it) - double mp_i = m.doubleValue(p_i); - - // if L(i) >= M(i) - if (l_i >= m_i) { - // M(P(i)) = min { M(P(i)), L(i) } - m.putDouble(p_i, Math.min(mp_i, l_i)); - - // L(i) = M(i) - lambda.putDouble(it, m_i); - - // P(i) = n+1; - pi.put(it, id); - } else { - // M(P(i)) = min { M(P(i)), M(i) } - m.putDouble(p_i, Math.min(mp_i, m_i)); - } - } - } - - /** - * Fourth step: Actualize the clusters if necessary - * - * @param id the id of the current object - * @param pi Pi data store - * @param lambda Lambda data store - * @param processedIDs the already processed ids - */ - private void step4double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda, DBIDs processedIDs) { - DBIDVar p_i = DBIDUtil.newVar(); - // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - double l_i = lambda.doubleValue(it); - pi.assignVar(it, p_i); // p_i = pi(it) - double lp_i = lambda.doubleValue(p_i); - - // if L(i) >= L(P(i)) - if (l_i >= lp_i) { - // P(i) = n+1 - pi.put(it, id); - } - } - } - - /** - * Extract all clusters from the pi-lambda-representation. - * - * @param ids Object ids to process - * @param pi Pi store - * @param lambda Lambda store - * @param minclusters Minimum number of clusters to extract - * - * @return Hierarchical clustering - */ - private Clustering<DendrogramModel<D>> extractClusters(DBIDs ids, final DBIDDataStore pi, final DataStore<D> lambda, int minclusters) { - FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), LOG) : null; - D nulldist = getDistanceFunction().getDistanceFactory().nullDistance(); - - // Sort DBIDs by lambda. We need this for two things: - // a) to determine the stop distance from "minclusters" parameter - // b) to process arrows in decreasing / increasing order - ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); - order.sort(new CompareByLambda<D>(lambda)); - - // Stop distance: - final D stopdist = (minclusters > 0) ? lambda.get(order.get(ids.size() - minclusters)) : null; - - // The initial pass is top-down. - DBIDArrayIter it = order.iter(); - int split = (minclusters > 0) ? Math.max(ids.size() - minclusters, 0) : 0; - // Tie handling: decrement split. 
- if (stopdist != null) { - while (split > 0) { - it.seek(split - 1); - if (stopdist.compareTo(lambda.get(it)) == 0) { - split--; - minclusters++; - } else { - break; - } - } - } - - // Extract the child clusters - int cnum = 0; - int expcnum = Math.max(0, minclusters); - WritableIntegerDataStore cluster_map = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -1); - ArrayList<ModifiableDBIDs> cluster_dbids = new ArrayList<ModifiableDBIDs>(expcnum); - ArrayList<D> cluster_dist = new ArrayList<D>(expcnum); - ArrayModifiableDBIDs cluster_leads = DBIDUtil.newArray(expcnum); - - DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. - // Go backwards on the lower part. - for (it.seek(split - 1); it.valid(); it.retract()) { - D dist = lambda.get(it); // Distance to successor - pi.assignVar(it, succ); // succ = pi(it) - int clusterid = cluster_map.intValue(succ); - // Successor cluster has already been created: - if (clusterid >= 0) { - cluster_dbids.get(clusterid).add(it); - cluster_map.putInt(it, clusterid); - // Update distance to maximum encountered: - if (cluster_dist.get(clusterid).compareTo(dist) < 0) { - cluster_dist.set(clusterid, dist); - } - } else { - // Need to start a new cluster: - clusterid = cnum; // next cluster number. - ModifiableDBIDs cids = DBIDUtil.newArray(); - // Add element and successor as initial members: - cids.add(succ); - cluster_map.putInt(succ, clusterid); - cids.add(it); - cluster_map.putInt(it, clusterid); - // Store new cluster. - cluster_dbids.add(cids); - cluster_leads.add(succ); - cluster_dist.add(dist); - cnum++; - } - - // Decrement counter - if (progress != null) { - progress.incrementProcessed(LOG); - } - } - // Build a hierarchy out of these clusters. - Cluster<DendrogramModel<D>> root = null; - ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier = new HierarchyHashmapList<Cluster<DendrogramModel<D>>>(); - ArrayList<Cluster<DendrogramModel<D>>> clusters = new ArrayList<Cluster<DendrogramModel<D>>>(ids.size() + expcnum - split); - // Convert initial clusters to cluster objects - { - int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { - clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i), hier)); - } - cluster_dist = null; // Invalidate - cluster_dbids = null; // Invalidate - } - // Process the upper part, bottom-up. - for (it.seek(split); it.valid(); it.advance()) { - int clusterid = cluster_map.intValue(it); - // The current cluster: - final Cluster<DendrogramModel<D>> clus; - if (clusterid >= 0) { - clus = clusters.get(clusterid); - } else { - ArrayModifiableDBIDs cids = DBIDUtil.newArray(1); - cids.add(it); - clus = makeCluster(it, nulldist, cids, hier); - // No need to store in clusters: cannot have another incoming pi - // pointer! - } - // The successor to join: - pi.assignVar(it, succ); // succ = pi(it) - if (DBIDUtil.equal(it, succ)) { - assert (root == null); - root = clus; - } else { - // Parent cluster: - int parentid = cluster_map.intValue(succ); - D depth = lambda.get(it); - // Parent cluster exists - merge as a new cluster: - if (parentid >= 0) { - Cluster<DendrogramModel<D>> pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS, hier); - hier.add(pclus, clusters.get(parentid)); - hier.add(pclus, clus); - clusters.set(parentid, pclus); // Replace existing parent cluster - } else { - // Create a new, one-element, parent cluster. 
- parentid = cnum; - cnum++; - ArrayModifiableDBIDs cids = DBIDUtil.newArray(1); - cids.add(succ); - Cluster<DendrogramModel<D>> pclus = makeCluster(succ, depth, cids, hier); - hier.add(pclus, clus); - assert (clusters.size() == parentid); - clusters.add(pclus); // Remember parent cluster - cluster_map.putInt(succ, parentid); // Reference - } - } - - // Decrement counter - if (progress != null) { - progress.incrementProcessed(LOG); - } - } - - if (progress != null) { - progress.ensureCompleted(LOG); - } - // build hierarchy - final Clustering<DendrogramModel<D>> dendrogram = new Clustering<DendrogramModel<D>>("Single-Link-Dendrogram", "slink-dendrogram"); - dendrogram.addCluster(root); - - return dendrogram; - } - - /** - * Extract all clusters from the pi-lambda-representation. - * - * @param ids Object ids to process - * @param pi Pi store - * @param lambda Lambda store - * @param minclusters Minimum number of clusters to extract - * - * @return Hierarchical clustering - */ - private Clustering<DendrogramModel<D>> extractClustersDouble(DBIDs ids, final DBIDDataStore pi, final DoubleDistanceDataStore lambda, int minclusters) { - FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), LOG) : null; - D nulldist = getDistanceFunction().getDistanceFactory().nullDistance(); - - // Sort DBIDs by lambda. We need this for two things: - // a) to determine the stop distance from "minclusters" parameter - // b) to process arrows in decreasing / increasing order - ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); - order.sort(new CompareByDoubleLambda(lambda)); - - // Stop distance: - final double stopdist = (minclusters > 0) ? lambda.doubleValue(order.get(ids.size() - minclusters)) : Double.POSITIVE_INFINITY; - - // The initial pass is top-down. - DBIDArrayIter it = order.iter(); - int split = (minclusters > 0) ? Math.max(ids.size() - minclusters, 0) : 0; - // Tie handling: decrement split. - if (minclusters > 0) { - while (split > 0) { - it.seek(split - 1); - if (stopdist <= lambda.doubleValue(it)) { - split--; - minclusters++; - } else { - break; - } - } - } - - // Extract the child clusters - int cnum = 0; - int expcnum = Math.max(0, minclusters); - WritableIntegerDataStore cluster_map = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -1); - ArrayList<ModifiableDBIDs> cluster_dbids = new ArrayList<ModifiableDBIDs>(expcnum); - TDoubleArrayList cluster_dist = new TDoubleArrayList(expcnum); - ArrayModifiableDBIDs cluster_leads = DBIDUtil.newArray(expcnum); - - DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. - // Go backwards on the lower part. - for (it.seek(split - 1); it.valid(); it.retract()) { - double dist = lambda.doubleValue(it); // Distance to successor - pi.assignVar(it, succ); // succ = pi(it) - int clusterid = cluster_map.intValue(succ); - // Successor cluster has already been created: - if (clusterid >= 0) { - cluster_dbids.get(clusterid).add(it); - cluster_map.putInt(it, clusterid); - // Update distance to maximum encountered: - if (cluster_dist.get(clusterid) < dist) { - cluster_dist.set(clusterid, dist); - } - } else { - // Need to start a new cluster: - clusterid = cnum; // next cluster number. - ModifiableDBIDs cids = DBIDUtil.newArray(); - // Add element and successor as initial members: - cids.add(succ); - cluster_map.putInt(succ, clusterid); - cids.add(it); - cluster_map.putInt(it, clusterid); - // Store new cluster. 
- cluster_dbids.add(cids); - cluster_leads.add(succ); - cluster_dist.add(dist); - cnum++; - } - - // Decrement counter - if (progress != null) { - progress.incrementProcessed(LOG); - } - } - // Build a hierarchy out of these clusters. - Cluster<DendrogramModel<D>> root = null; - ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier = new HierarchyHashmapList<Cluster<DendrogramModel<D>>>(); - ArrayList<Cluster<DendrogramModel<D>>> clusters = new ArrayList<Cluster<DendrogramModel<D>>>(ids.size() + expcnum - split); - // Convert initial clusters to cluster objects - { - int i = 0; - for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { - @SuppressWarnings("unchecked") - D depth = (D) new DoubleDistance(cluster_dist.get(i)); - clusters.add(makeCluster(it2, depth, cluster_dbids.get(i), hier)); - } - cluster_dist = null; // Invalidate - cluster_dbids = null; // Invalidate - } - // Process the upper part, bottom-up. - for (it.seek(split); it.valid(); it.advance()) { - int clusterid = cluster_map.intValue(it); - // The current cluster: - final Cluster<DendrogramModel<D>> clus; - if (clusterid >= 0) { - clus = clusters.get(clusterid); - } else { - ArrayModifiableDBIDs cids = DBIDUtil.newArray(1); - cids.add(it); - clus = makeCluster(it, nulldist, cids, hier); - // No need to store in clusters: cannot have another incoming pi - // pointer! - } - // The successor to join: - pi.assignVar(it, succ); // succ = pi(it) - if (DBIDUtil.equal(it, succ)) { - assert (root == null); - root = clus; - } else { - // Parent cluster: - int parentid = cluster_map.intValue(succ); - @SuppressWarnings("unchecked") - D depth = (D) new DoubleDistance(lambda.doubleValue(it)); - // Parent cluster exists - merge as a new cluster: - if (parentid >= 0) { - Cluster<DendrogramModel<D>> pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS, hier); - hier.add(pclus, clusters.get(parentid)); - hier.add(pclus, clus); - clusters.set(parentid, pclus); // Replace existing parent cluster - } else { - // Create a new, one-element, parent cluster. 
- parentid = cnum; - cnum++; - ArrayModifiableDBIDs cids = DBIDUtil.newArray(1); - cids.add(succ); - Cluster<DendrogramModel<D>> pclus = makeCluster(succ, depth, cids, hier); - hier.add(pclus, clus); - assert (clusters.size() == parentid); - clusters.add(pclus); // Remember parent cluster - cluster_map.putInt(succ, parentid); // Reference - } - } - - // Decrement counter - if (progress != null) { - progress.incrementProcessed(LOG); - } - } - - if (progress != null) { - progress.ensureCompleted(LOG); - } - // build hierarchy - final Clustering<DendrogramModel<D>> dendrogram = new Clustering<DendrogramModel<D>>("Single-Link-Dendrogram", "slink-dendrogram"); - dendrogram.addCluster(root); - - return dendrogram; - } - - /** - * Make the cluster for the given object - * - * @param lead Leading object - * @param depth Linkage depth - * @param members Member objects - * @param hier Cluster hierarchy - * @return Cluster - */ - private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members, ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier) { - final String name; - if (members.size() == 0) { - name = "merge_" + lead + "_" + depth; - } else if (depth.isInfiniteDistance()) { - assert (members.contains(lead)); - name = "object_" + lead; - } else { - name = "cluster_" + lead + "_" + depth; - } - Cluster<DendrogramModel<D>> cluster = new Cluster<DendrogramModel<D>>(name, members, new DendrogramModel<D>(depth), hier); - return cluster; - } - - @Override - public TypeInformation[] getInputTypeRestriction() { - return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); - } - - @Override - protected Logging getLogger() { - return LOG; - } - - /** - * Order a DBID collection by the lambda value. - * - * @author Erich Schubert - * - * @apiviz.exclude - * - * @param <D> Distance type - */ - private static final class CompareByLambda<D extends Distance<D>> implements Comparator<DBIDRef> { - /** - * Lambda storage - */ - private final DataStore<D> lambda; - - /** - * Constructor. - * - * @param lambda Lambda storage - */ - protected CompareByLambda(DataStore<D> lambda) { - this.lambda = lambda; - } - - @Override - public int compare(DBIDRef id1, DBIDRef id2) { - D k1 = lambda.get(id1); - D k2 = lambda.get(id2); - assert (k1 != null); - assert (k2 != null); - return k1.compareTo(k2); - } - } - - /** - * Order a DBID collection by the lambda value. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - private static final class CompareByDoubleLambda implements Comparator<DBIDRef> { - /** - * Lambda storage - */ - private final DoubleDistanceDataStore lambda; - - /** - * Constructor. - * - * @param lambda Lambda storage - */ - protected CompareByDoubleLambda(DoubleDistanceDataStore lambda) { - this.lambda = lambda; - } - - @Override - public int compare(DBIDRef id1, DBIDRef id2) { - double k1 = lambda.doubleValue(id1); - double k2 = lambda.doubleValue(id2); - return Double.compare(k1, k2); - } - } - - /** - * Parameterization class. 
- * - * @author Erich Schubert - * - * @apiviz.exclude - */ - public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { - /** - * The minimum number of clusters to extract - */ - public static final OptionID SLINK_MINCLUSTERS_ID = new OptionID("slink.minclusters", "The maximum number of clusters to extract."); - - protected int minclusters = -1; - - @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter minclustersP = new IntParameter(SLINK_MINCLUSTERS_ID); - minclustersP.addConstraint(new GreaterEqualConstraint(1)); - minclustersP.setOptional(true); - if (config.grab(minclustersP)) { - minclusters = minclustersP.intValue(); - } - } - - @Override - protected SLINK<O, D> makeInstance() { - return new SLINK<O, D>(distanceFunction, minclusters); - } - } -} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java index f3b59c42..95d9f23c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -151,7 +151,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple FiniteProgress objprog = LOG.isVerbose() ? new FiniteProgress("SNNClustering", relation.size(), LOG) : null; IndefiniteProgress clusprog = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null; - resultList = new ArrayList<ModifiableDBIDs>(); + resultList = new ArrayList<>(); noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(relation.size()); if(relation.size() >= minpts) { @@ -183,11 +183,11 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple clusprog.setCompleted(LOG); } - Clustering<Model> result = new Clustering<Model>("Shared-Nearest-Neighbor Clustering", "snn-clustering"); + Clustering<Model> result = new Clustering<>("Shared-Nearest-Neighbor Clustering", "snn-clustering"); for(Iterator<ModifiableDBIDs> resultListIter = resultList.iterator(); resultListIter.hasNext();) { - result.addCluster(new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER)); + result.addToplevelCluster(new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER)); } - result.addCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER)); + result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER)); return result; } @@ -322,7 +322,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple Class<SharedNearestNeighborSimilarityFunction<O>> cls = ClassGenericsUtil.uglyCastIntoSubclass(SharedNearestNeighborSimilarityFunction.class); similarityFunction = config.tryInstantiate(cls); - DistanceParameter<IntegerDistance> epsilonP = new DistanceParameter<IntegerDistance>(EPSILON_ID, IntegerDistance.FACTORY); + DistanceParameter<IntegerDistance> epsilonP = new DistanceParameter<>(EPSILON_ID, IntegerDistance.FACTORY); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } @@ -336,7 +336,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple @Override protected 
SNNClustering<O> makeInstance() { - return new SNNClustering<O>(similarityFunction, epsilon, minpts); + return new SNNClustering<>(similarityFunction, epsilon, minpts); } } }
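For reference on the SLINK.java removal above: the deleted class implemented Sibson's pointer representation, where each object i carries pi(i), the object it is next merged with, and lambda(i), the distance at which that merge happens, updated in the four steps documented in its Javadoc (step1 through step4). The following is a standalone, simplified sketch of that same recurrence on plain arrays with a precomputed distance matrix; it is purely illustrative and is not the removed ELKI implementation:

// SLINK pointer representation (Sibson 1973), simplified sketch.
// dist is a symmetric n x n distance matrix; pi and lambda have length n.
// After the loop, pi[i]/lambda[i] encode the single-link dendrogram.
static void slink(double[][] dist, int[] pi, double[] lambda) {
  int n = dist.length;
  double[] m = new double[n];
  for (int k = 0; k < n; k++) {
    pi[k] = k;                              // step 1: P(k) = k
    lambda[k] = Double.POSITIVE_INFINITY;   // step 1: L(k) = infinity
    for (int i = 0; i < k; i++) {
      m[i] = dist[i][k];                    // step 2: M(i) = d(i, k)
    }
    for (int i = 0; i < k; i++) {           // step 3: update P and L
      if (lambda[i] >= m[i]) {
        m[pi[i]] = Math.min(m[pi[i]], lambda[i]);
        lambda[i] = m[i];
        pi[i] = k;
      } else {
        m[pi[i]] = Math.min(m[pi[i]], m[i]);
      }
    }
    for (int i = 0; i < k; i++) {           // step 4: re-point stale entries
      if (lambda[i] >= lambda[pi[i]]) {
        pi[i] = k;
      }
    }
  }
}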
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java index 1cb1eb0d..0d82add9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -66,8 +66,9 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.IntegerPriorityObject; +import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -263,8 +264,8 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin */ private Relation<ParameterizationFunction> preprocess(Database db, Relation<V> vrel) { DBIDs ids = vrel.getDBIDs(); - SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<ParameterizationFunction>(ParameterizationFunction.class); - MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<ParameterizationFunction>(db, type, ids); + SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<>(ParameterizationFunction.class); + MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<>(db, type, ids); // Project for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { @@ -284,12 +285,12 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @return a mapping of subspace dimensionalities to clusters */ private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) { - Clustering<Model> res = new Clustering<Model>("CASH clustering", "cash-clustering"); + Clustering<Model> res = new Clustering<>("CASH clustering", "cash-clustering"); final int dim = dimensionality(relation); // init heap - Heap<IntegerPriorityObject<CASHInterval>> heap = new Heap<IntegerPriorityObject<CASHInterval>>(); + ObjectHeap<IntegerPriorityObject<CASHInterval>> heap = new ComparableMinHeap<>(); ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs()); initHeap(heap, relation, dim, noiseIDs); @@ -338,7 +339,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // add result of dim-1 to this result Clustering<Model> res_dim_minus_1 = doRun(db, progress); for (Cluster<Model> cluster : res_dim_minus_1.getAllClusters()) { - res.addCluster(cluster); + res.addToplevelCluster(cluster); noiseIDs.removeDBIDs(cluster.getIDs()); clusterIDs.addDBIDs(cluster.getIDs()); processedIDs.addDBIDs(cluster.getIDs()); @@ -349,23 +350,23 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin else { 
LinearEquationSystem les = runDerivator(relation, dim - 1, interval.getIDs()); Cluster<Model> c = new Cluster<Model>(interval.getIDs(), new LinearEquationModel(les)); - res.addCluster(c); + res.addToplevelCluster(c); noiseIDs.removeDBIDs(interval.getIDs()); clusterIDs.addDBIDs(interval.getIDs()); processedIDs.addDBIDs(interval.getIDs()); } // Rebuild heap - ArrayList<IntegerPriorityObject<CASHInterval>> heapVector = new ArrayList<IntegerPriorityObject<CASHInterval>>(heap.size()); - for (IntegerPriorityObject<CASHInterval> obj : heap) { - heapVector.add(obj); + ArrayList<IntegerPriorityObject<CASHInterval>> heapVector = new ArrayList<>(heap.size()); + for (ObjectHeap.UnsortedIter<IntegerPriorityObject<CASHInterval>> iter = heap.unsortedIter(); iter.valid(); iter.advance()) { + heapVector.add(iter.get()); } heap.clear(); for (IntegerPriorityObject<CASHInterval> pair : heapVector) { CASHInterval currentInterval = pair.getObject(); currentInterval.removeIDs(clusterIDs); if (currentInterval.getIDs().size() >= minPts) { - heap.add(new IntegerPriorityObject<CASHInterval>(currentInterval.priority(), currentInterval)); + heap.add(new IntegerPriorityObject<>(currentInterval.priority(), currentInterval)); } } @@ -378,12 +379,12 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin if (!noiseIDs.isEmpty()) { if (dim == noiseDim) { Cluster<Model> c = new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER); - res.addCluster(c); + res.addToplevelCluster(c); processedIDs.addDBIDs(noiseIDs); } else if (noiseIDs.size() >= minPts) { LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs); Cluster<Model> c = new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)); - res.addCluster(c); + res.addToplevelCluster(c); processedIDs.addDBIDs(noiseIDs); } } @@ -427,7 +428,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @param dim the dimensionality of the database * @param ids the ids of the database */ - private void initHeap(Heap<IntegerPriorityObject<CASHInterval>> heap, Relation<ParameterizationFunction> relation, int dim, DBIDs ids) { + private void initHeap(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap, Relation<ParameterizationFunction> relation, int dim, DBIDs ids) { CASHIntervalSplit split = new CASHIntervalSplit(relation, minPts); // determine minimum and maximum function value of all functions @@ -479,7 +480,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin ModifiableDBIDs intervalIDs = split.determineIDs(ids, alphaInterval, d_mins[i], d_maxs[i]); if (intervalIDs != null && intervalIDs.size() >= minPts) { CASHInterval rootInterval = new CASHInterval(alphaMin, alphaMax, split, intervalIDs, -1, 0, d_mins[i], d_maxs[i]); - heap.add(new IntegerPriorityObject<CASHInterval>(rootInterval.priority(), rootInterval)); + heap.add(new IntegerPriorityObject<>(rootInterval.priority(), rootInterval)); } } @@ -503,8 +504,8 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin */ private MaterializedRelation<ParameterizationFunction> buildDB(int dim, Matrix basis, DBIDs ids, Relation<ParameterizationFunction> relation) { ProxyDatabase proxy = new ProxyDatabase(ids); - SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<ParameterizationFunction>(ParameterizationFunction.class); - MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<ParameterizationFunction>(proxy, type, ids); + 
SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<>(ParameterizationFunction.class); + MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<>(proxy, type, ids); proxy.addRelation(prep); // Project @@ -566,7 +567,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin private double sinusProduct(int start, int end, double[] alpha) { double result = 1; for (int j = start; j < end; j++) { - result *= StrictMath.sin(alpha[j]); + result *= Math.sin(alpha[j]); } return result; } @@ -578,7 +579,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @param heap the heap storing the intervals * @return the next ''best'' interval at maximum level */ - private CASHInterval determineNextIntervalAtMaxLevel(Heap<IntegerPriorityObject<CASHInterval>> heap) { + private CASHInterval determineNextIntervalAtMaxLevel(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap) { CASHInterval next = doDetermineNextIntervalAtMaxLevel(heap); // noise path was chosen while (next == null) { @@ -598,7 +599,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @param heap the heap storing the intervals * @return the next ''best'' interval at maximum level */ - private CASHInterval doDetermineNextIntervalAtMaxLevel(Heap<IntegerPriorityObject<CASHInterval>> heap) { + private CASHInterval doDetermineNextIntervalAtMaxLevel(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap) { CASHInterval interval = heap.poll().getObject(); int dim = interval.getDimensionality(); while (true) { @@ -632,10 +633,10 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin int comp = interval.getLeftChild().compareTo(interval.getRightChild()); if (comp < 0) { bestInterval = interval.getRightChild(); - heap.add(new IntegerPriorityObject<CASHInterval>(interval.getLeftChild().priority(), interval.getLeftChild())); + heap.add(new IntegerPriorityObject<>(interval.getLeftChild().priority(), interval.getLeftChild())); } else { bestInterval = interval.getLeftChild(); - heap.add(new IntegerPriorityObject<CASHInterval>(interval.getRightChild().priority(), interval.getRightChild())); + heap.add(new IntegerPriorityObject<>(interval.getRightChild().priority(), interval.getRightChild())); } } else if (interval.getLeftChild() == null) { bestInterval = interval.getRightChild(); @@ -733,8 +734,8 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin DBIDs ids = interval.getIDs(); ProxyDatabase proxy = new ProxyDatabase(ids); int dim = dimensionality(relation); - SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.FACTORY, dim); - MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids); + SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dim); + MaterializedRelation<DoubleVector> prep = new MaterializedRelation<>(proxy, type, ids); proxy.addRelation(prep); // Project @@ -792,8 +793,8 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, DBIDs ids) { ProxyDatabase proxy = new ProxyDatabase(ids); int dim = dimensionality(relation); - SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.FACTORY, dim); - MaterializedRelation<DoubleVector> prep = new 
MaterializedRelation<DoubleVector>(proxy, type, ids); + SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dim); + MaterializedRelation<DoubleVector> prep = new MaterializedRelation<>(proxy, type, ids); proxy.addRelation(prep); // Project @@ -864,7 +865,7 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin @Override protected CASH<NumberVector<?>> makeInstance() { - return new CASH<NumberVector<?>>(minpts, maxlevel, mindim, jitter, adjust); + return new CASH<>(minpts, maxlevel, mindim, jitter, adjust); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java index ac50559e..9a4b8512 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -185,7 +185,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs LocalProjectionIndex<V, ?> preprocin = partitionDistanceQuery.getIndex(); // partitioning - Map<Integer, ModifiableDBIDs> partitionMap = new HashMap<Integer, ModifiableDBIDs>(); + Map<Integer, ModifiableDBIDs> partitionMap = new HashMap<>(); FiniteProgress partitionProgress = LOG.isVerbose() ? new FiniteProgress("Partitioning", relation.size(), LOG) : null; int processed = 1; @@ -214,7 +214,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs // convert for partition algorithm. // TODO: do this with DynamicDBIDs instead - Map<Integer, DBIDs> pmap = new HashMap<Integer, DBIDs>(); + Map<Integer, DBIDs> pmap = new HashMap<>(); for(Entry<Integer, ModifiableDBIDs> ent : partitionMap.entrySet()) { pmap.put(ent.getKey(), ent.getValue()); } @@ -230,14 +230,14 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs * @param query The preprocessor based query function */ private Clustering<Model> runPartitionAlgorithm(Relation<V> relation, Map<Integer, DBIDs> partitionMap, DistanceQuery<V, D> query) { - Clustering<Model> result = new Clustering<Model>("COPAC clustering", "copac-clustering"); + Clustering<Model> result = new Clustering<>("COPAC clustering", "copac-clustering"); // TODO: use an extra finite progress for the partitions? for(Entry<Integer, DBIDs> pair : partitionMap.entrySet()) { // noise partition if(pair.getKey() == RelationUtil.dimensionality(relation)) { // Make a Noise cluster - result.addCluster(new Cluster<Model>(pair.getValue(), true, ClusterModel.CLUSTER)); + result.addToplevelCluster(new Cluster<Model>(pair.getValue(), true, ClusterModel.CLUSTER)); } else { DBIDs partids = pair.getValue(); @@ -251,10 +251,10 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs // Re-Wrap resulting Clusters as DimensionModel clusters. 
for(Cluster<Model> clus : p.getAllClusters()) { if(clus.isNoise()) { - result.addCluster(new Cluster<Model>(clus.getIDs(), true, ClusterModel.CLUSTER)); + result.addToplevelCluster(new Cluster<Model>(clus.getIDs(), true, ClusterModel.CLUSTER)); } else { - result.addCluster(new Cluster<Model>(clus.getIDs(), new DimensionModel(pair.getKey()))); + result.addToplevelCluster(new Cluster<Model>(clus.getIDs(), new DimensionModel(pair.getKey()))); } } } @@ -316,12 +316,12 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ClassParameter<Factory<V, ?>> indexP = new ClassParameter<LocalProjectionIndex.Factory<V, ?>>(PREPROCESSOR_ID, LocalProjectionIndex.Factory.class); + ClassParameter<Factory<V, ?>> indexP = new ClassParameter<>(PREPROCESSOR_ID, LocalProjectionIndex.Factory.class); if(config.grab(indexP)) { indexI = indexP.instantiateClass(config); } - ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>> pdistP = new ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>>(PARTITION_DISTANCE_ID, FilteredLocalPCABasedDistanceFunction.class, LocallyWeightedDistanceFunction.class); + ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>> pdistP = new ObjectParameter<>(PARTITION_DISTANCE_ID, FilteredLocalPCABasedDistanceFunction.class, LocallyWeightedDistanceFunction.class); if(config.grab(pdistP)) { ListParameterization predefinedDist = new ListParameterization(); predefinedDist.addParameter(IndexBasedDistanceFunction.INDEX_ID, indexI); @@ -332,7 +332,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs } // Parameterize algorithm: - ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<ClusteringAlgorithm<Clustering<Model>>>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class); + ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class); if(config.grab(algP)) { ListParameterization predefined = new ListParameterization(); predefined.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI); @@ -348,7 +348,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs @Override protected COPAC<V, D> makeInstance() { - return new COPAC<V, D>(pdistI, algC, algO); + return new COPAC<>(pdistI, algC, algO); } } }
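In the CASH hunks further above, the generic Heap field becomes an ObjectHeap instantiated as a ComparableMinHeap (both from de.lmu.ifi.dbs.elki.utilities.datastructures.heap, per the import hunk). Because these heaps are not Iterable, the heap-rebuild pass now scans the contents with unsortedIter(). A small illustrative fragment of that API, with an Integer payload standing in for IntegerPriorityObject<CASHInterval>; it is a sketch only, not code from the changeset:

ComparableMinHeap<Integer> heap = new ComparableMinHeap<>();
heap.add(42);
heap.add(7);
// Unordered scan, as used when rebuilding the heap after pruning intervals:
for (ObjectHeap.UnsortedIter<Integer> it = heap.unsortedIter(); it.valid(); it.advance()) {
  System.out.println(it.get());
}
Integer smallest = heap.poll(); // ordered removal still goes through poll()
heap.clear();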
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java index 7e7314b4..d535e136 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ERiC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,8 +25,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; import java.util.ArrayList; import java.util.List; -import java.util.SortedMap; -import java.util.TreeMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; @@ -58,6 +56,8 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult; import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; +import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -125,7 +125,7 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null; // run COPAC - if(stepprog != null) { + if (stepprog != null) { stepprog.beginStep(1, "Preprocessing local correlation dimensionalities and partitioning data", LOG); } Clustering<Model> copacResult = copacAlgorithm.run(relation); @@ -133,16 +133,16 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin DistanceQuery<V, IntegerDistance> query = copacAlgorithm.getPartitionDistanceQuery(); // extract correlation clusters - if(stepprog != null) { + if (stepprog != null) { stepprog.beginStep(2, "Extract correlation clusters", LOG); } - SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality); - if(LOG.isDebugging()) { + List<List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality); + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters..."); - for(Integer corrDim : clusterMap.keySet()) { + for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) { List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim); msg.append("\n\ncorrDim ").append(corrDim); - for(Cluster<CorrelationModel<V>> cluster : correlationClusters) { + for (Cluster<CorrelationModel<V>> cluster : correlationClusters) { msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size()); // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex()); // msg.append("\n basis " + @@ -152,45 +152,45 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } LOG.debugFine(msg.toString()); } - if(LOG.isVerbose()) { + if (LOG.isVerbose()) { int clusters = 
0; - for(List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap.values()) { + for (List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap) { clusters += correlationClusters.size(); } LOG.verbose(clusters + " clusters extracted."); } // build hierarchy - if(stepprog != null) { + if (stepprog != null) { stepprog.beginStep(3, "Building hierarchy", LOG); } - buildHierarchy(clusterMap, query); - if(LOG.isDebugging()) { + Clustering<CorrelationModel<V>> clustering = new Clustering<>("ERiC clustering", "eric-clustering"); + buildHierarchy(clustering, clusterMap, query); + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder("Step 3: Build hierarchy"); - for(Integer corrDim : clusterMap.keySet()) { + for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) { List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim); - for(Cluster<CorrelationModel<V>> cluster : correlationClusters) { + for (Cluster<CorrelationModel<V>> cluster : correlationClusters) { msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size()); // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex()); - for(int i = 0; i < cluster.getParents().size(); i++) { - msg.append("\n parent ").append(cluster.getParents().get(i)); + for (Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) { + msg.append("\n parent ").append(iter.get()); } - for(int i = 0; i < cluster.numChildren(); i++) { - msg.append("\n child ").append(cluster.getChildren().get(i)); + for (Iter<Cluster<CorrelationModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) { + msg.append("\n child ").append(iter.get()); } } } LOG.debugFine(msg.toString()); } - if(stepprog != null) { + if (stepprog != null) { stepprog.setCompleted(LOG); } - Clustering<CorrelationModel<V>> result = new Clustering<CorrelationModel<V>>("ERiC clustering", "eric-clustering"); - for(Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.lastKey())) { - result.addCluster(rc); + for (Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.size() - 1)) { + clustering.addToplevelCluster(rc); } - return result; + return clustering; } /** @@ -203,77 +203,75 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * * @param database the database containing the objects * @param dimensionality the dimensionality of the feature space - * @return a mapping of correlation dimension to maps of clusters + * @return a list of clusters for each dimensionality */ - private SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> copacResult, Relation<V> database, int dimensionality) { + private List<List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> copacResult, Relation<V> database, int dimensionality) { // result - SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap = new TreeMap<Integer, List<Cluster<CorrelationModel<V>>>>(); + List<List<Cluster<CorrelationModel<V>>>> clusterMap = new ArrayList<>(); + for (int i = 0; i <= dimensionality; i++) { + clusterMap.add(new ArrayList<Cluster<CorrelationModel<V>>>()); + } // noise cluster containing all noise objects over all partitions Cluster<Model> noise = null; // iterate over correlation dimensions - for(Cluster<Model> clus : copacResult.getAllClusters()) { + for (Cluster<Model> clus : 
copacResult.getAllClusters()) { DBIDs group = clus.getIDs(); - if(clus.getModel() != null && clus.getModel() instanceof DimensionModel) { + if (clus.getModel() != null && clus.getModel() instanceof DimensionModel) { int correlationDimension = ((DimensionModel) clus.getModel()).getDimension(); ListParameterization parameters = pcaParameters(correlationDimension); Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class); PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls); - for(ParameterException e : parameters.getErrors()) { - LOG.warning("Error in internal parameterization: " + e.getMessage()); - } + parameters.failOnErrors(); // get cluster list for this dimension. List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(correlationDimension); - if(correlationClusters == null) { - correlationClusters = new ArrayList<Cluster<CorrelationModel<V>>>(); - clusterMap.put(correlationDimension, correlationClusters); - } - PCAFilteredResult pcares = pca.processIds(group, database); V centroid = Centroid.make(database, group).toVector(database); - Cluster<CorrelationModel<V>> correlationCluster = new Cluster<CorrelationModel<V>>("[" + correlationDimension + "_" + correlationClusters.size() + "]", group, new CorrelationModel<V>(pcares, centroid), new ArrayList<Cluster<CorrelationModel<V>>>(), new ArrayList<Cluster<CorrelationModel<V>>>()); + Cluster<CorrelationModel<V>> correlationCluster = new Cluster<>("[" + correlationDimension + "_" + correlationClusters.size() + "]", group, new CorrelationModel<>(pcares, centroid)); correlationClusters.add(correlationCluster); } // partition containing noise - else if(clus.getModel() != null && clus.isNoise()) { - if(noise == null) { + else if (clus.getModel() != null && clus.isNoise()) { + if (noise == null) { noise = clus; - } - else { + } else { ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs()); merged.addDBIDs(clus.getIDs()); noise.setIDs(merged); } - } - else { + } else { throw new IllegalStateException("Unexpected group returned: " + clus.getClass().getName()); } } - if(noise != null && noise.size() > 0) { + if (noise != null && noise.size() > 0) { // get cluster list for this dimension. List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(dimensionality); - if(correlationClusters == null) { - correlationClusters = new ArrayList<Cluster<CorrelationModel<V>>>(); - clusterMap.put(dimensionality, correlationClusters); - } ListParameterization parameters = pcaParameters(dimensionality); Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class); PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls); - for(ParameterException e : parameters.getErrors()) { + for (ParameterException e : parameters.getErrors()) { LOG.warning("Error in internal parameterization: " + e.getMessage()); } PCAFilteredResult pcares = pca.processIds(noise.getIDs(), database); V centroid = Centroid.make(database, noise.getIDs()).toVector(database); - Cluster<CorrelationModel<V>> correlationCluster = new Cluster<CorrelationModel<V>>("[noise]", noise.getIDs(), new CorrelationModel<V>(pcares, centroid), new ArrayList<Cluster<CorrelationModel<V>>>(), new ArrayList<Cluster<CorrelationModel<V>>>()); + Cluster<CorrelationModel<V>> correlationCluster = new Cluster<>("[noise]", noise.getIDs(), new CorrelationModel<>(pcares, centroid)); correlationClusters.add(correlationCluster); } + // Delete dimensionalities not found. 
+ for (int i = dimensionality; i > 0; i--) { + if (clusterMap.get(i).size() > 0) { + break; + } + clusterMap.remove(i); + } + return clusterMap; } @@ -292,48 +290,48 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin return parameters; } - private void buildHierarchy(SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) { - StringBuilder msg = new StringBuilder(); + private void buildHierarchy(Clustering<CorrelationModel<V>> clustering, List<List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) { + StringBuilder msg = LOG.isDebuggingFine() ? new StringBuilder() : null; + Hierarchy<Cluster<CorrelationModel<V>>> hier = clustering.getClusterHierarchy(); DBSCAN<V, DoubleDistance> dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query)); - if(dbscan == null) { + if (dbscan == null) { // TODO: appropriate exception class? throw new IllegalArgumentException("ERiC was run without DBSCAN as COPAC algorithm!"); } DistanceFunction<? super V, ?> dfun = ProxyDistanceFunction.unwrapDistance(dbscan.getDistanceFunction()); ERiCDistanceFunction distanceFunction = ClassGenericsUtil.castWithGenericsOrNull(ERiCDistanceFunction.class, dfun); - if(distanceFunction == null) { + if (distanceFunction == null) { // TODO: appropriate exception class? throw new IllegalArgumentException("ERiC was run without ERiCDistanceFunction as distance function: got " + dfun.getClass()); } - Integer lambda_max = clusterMap.lastKey(); + // Find maximum dimensionality found: + int lambda_max = clusterMap.size() - 1; - for(Integer childCorrDim : clusterMap.keySet()) { + for (int childCorrDim = 0; childCorrDim < lambda_max; childCorrDim++) { List<Cluster<CorrelationModel<V>>> children = clusterMap.get(childCorrDim); - SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> parentMap = clusterMap.tailMap(childCorrDim + 1); - if(LOG.isDebugging()) { + // SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> parentMap = + // clusterMap.tailMap(childCorrDim + 1); + if (msg != null) { msg.append("\ncorrdim ").append(childCorrDim); - msg.append("\nparents ").append(parentMap.keySet()); + // msg.append("\nparents ").append(parentMap.keySet()); } - for(Cluster<CorrelationModel<V>> child : children) { - for(Integer parentCorrDim : parentMap.keySet()) { - List<Cluster<CorrelationModel<V>>> parents = parentMap.get(parentCorrDim); - for(Cluster<CorrelationModel<V>> parent : parents) { + for (Cluster<CorrelationModel<V>> child : children) { + for (int parentCorrDim = childCorrDim + 1; parentCorrDim <= lambda_max; parentCorrDim++) { + List<Cluster<CorrelationModel<V>>> parents = clusterMap.get(parentCorrDim); + for (Cluster<CorrelationModel<V>> parent : parents) { int subspaceDim_parent = parent.getModel().getPCAResult().getCorrelationDimension(); - if(subspaceDim_parent == lambda_max && child.getParents().isEmpty()) { - parent.getChildren().add(child); - child.getParents().add(parent); - if(LOG.isDebugging()) { + if (subspaceDim_parent == lambda_max && hier.numParents(child) == 0) { + clustering.addChildCluster(parent, child); + if (msg != null) { msg.append('\n').append(parent).append(" is parent of ").append(child); } - } - else { + } else { BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult()); - if(!dist.bitValue() && (child.getParents().isEmpty() || 
!isParent(distanceFunction, parent, child.getParents()))) { - parent.getChildren().add(child); - child.getParents().add(parent); - if(LOG.isDebugging()) { + if (!dist.bitValue() && (hier.numParents(child) == 0 || !isParent(distanceFunction, parent, hier.iterParents(child)))) { + clustering.addChildCluster(parent, child); + if (msg != null) { msg.append('\n').append(parent).append(" is parent of ").append(child); } } @@ -342,7 +340,7 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } } - if(LOG.isDebugging()) { + if (msg != null) { LOG.debugFine(msg.toString()); } @@ -355,32 +353,32 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @param distanceFunction the distance function for distance computation * between the clusters * @param parent the parent to be tested - * @param children the list of children to be tested + * @param iter the list of children to be tested * @return true, if the specified parent cluster is a parent of one child of * the children clusters, false otherwise */ - private boolean isParent(ERiCDistanceFunction distanceFunction, Cluster<CorrelationModel<V>> parent, List<Cluster<CorrelationModel<V>>> children) { - - StringBuilder msg = new StringBuilder(); + private boolean isParent(ERiCDistanceFunction distanceFunction, Cluster<CorrelationModel<V>> parent, Iter<Cluster<CorrelationModel<V>>> iter) { + StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null; - for(Cluster<CorrelationModel<V>> child : children) { - if(parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) { + for (; iter.valid(); iter.advance()) { + Cluster<CorrelationModel<V>> child = iter.get(); + if (parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) { return false; } BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult()); - if(LOG.isDebugging()) { + if (msg != null) { msg.append("\ndist(").append(child).append(" - ").append(parent).append(") = ").append(dist); } - if(!dist.bitValue()) { - if(LOG.isDebugging()) { - LOG.debugFine(msg.toString()); + if (!dist.bitValue()) { + if (msg != null) { + LOG.debugFine(msg); } return true; } } - if(LOG.isDebugging()) { + if (msg != null) { LOG.debugFine(msg.toString()); } return false; @@ -395,7 +393,7 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin protected Logging getLogger() { return LOG; } - + /** * Parameterization class. * @@ -418,7 +416,7 @@ public class ERiC<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin @Override protected ERiC<V> makeInstance() { - return new ERiC<V>(copac); + return new ERiC<>(copac); } } -}
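The ERiC hunks above move the parent/child bookkeeping out of the Cluster objects and into the hierarchy owned by the Clustering result. A minimal sketch of the new usage, restricted to the calls visible in the hunk (imports are omitted and the helper name attachIfUnlinked is illustrative, not part of the patch):

  // Sketch: hierarchy maintenance via the Clustering object, as in buildHierarchy() above.
  static <V extends NumberVector<?>> void attachIfUnlinked(Clustering<CorrelationModel<V>> clustering, Cluster<CorrelationModel<V>> parent, Cluster<CorrelationModel<V>> child) {
    Hierarchy<Cluster<CorrelationModel<V>>> hier = clustering.getClusterHierarchy();
    if (hier.numParents(child) == 0) {
      // The clustering, not the cluster, now records the parent/child edge:
      clustering.addChildCluster(parent, child);
    }
    // Walking the existing parents of a cluster, as isParent() does above:
    for (Iter<Cluster<CorrelationModel<V>>> it = hier.iterParents(child); it.valid(); it.advance()) {
      Cluster<CorrelationModel<V>> p = it.get();
      // ... inspect p, e.g. compare correlation dimensionality ...
    }
  }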
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java index f56342e0..5235273c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/FourC.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -115,7 +115,7 @@ public class FourC<V extends NumberVector<?>> extends AbstractProjectedDBSCAN<Cl @Override protected FourC<O> makeInstance() { - return new FourC<O>(epsilon, minpts, outerdist, lambda); + return new FourC<>(epsilon, minpts, outerdist, lambda); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java index 759e8f59..d1b714bf 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java @@ -64,7 +64,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; */
@Title("Mining Hierarchies of Correlation Clusters")
@Description("Algorithm for detecting hierarchies of correlation clusters.")
-@Reference(authors = "E. Achtert, C. Böhm, P. Kröger, A. Zimek", title = "Mining Hierarchies of Correlation Clusterse", booktitle = "Proc. Int. Conf. on Scientific and Statistical Database Management (SSDBM'06), Vienna, Austria, 2006", url = "http://dx.doi.org/10.1109/SSDBM.2006.35")
+@Reference(authors = "E. Achtert, C. Böhm, P. Kröger, A. Zimek", title = "Mining Hierarchies of Correlation Clusters", booktitle = "Proc. Int. Conf. on Scientific and Statistical Database Management (SSDBM'06), Vienna, Austria, 2006", url = "http://dx.doi.org/10.1109/SSDBM.2006.35")
public class HiCO<V extends NumberVector<?>> extends OPTICS<V, PCACorrelationDistance> {
/**
* The logger for this class.
@@ -207,7 +207,7 @@ public class HiCO<V extends NumberVector<?>> extends OPTICS<V, PCACorrelationDis
    @Override
protected HiCO<V> makeInstance() {
- return new HiCO<V>(distance, mu);
+ return new HiCO<>(distance, mu);
}
}
}
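The makeInstance() change above, new HiCO<V>(distance, mu) becoming new HiCO<>(distance, mu), recurs throughout this commit: it is the Java 7 diamond operator, which lets the compiler infer the constructor's type arguments from the declaration. A self-contained illustration with plain JDK types (not ELKI classes):

  import java.util.ArrayList;
  import java.util.List;

  public class DiamondDemo {
    public static void main(String[] args) {
      // Pre-Java-7 style: type arguments spelled out on both sides.
      List<String> explicit = new ArrayList<String>();
      // Java 7 diamond operator: <String> is inferred from the declared type.
      List<String> inferred = new ArrayList<>();
      explicit.add("same");
      inferred.add("semantics");
      System.out.println(explicit.get(0) + " " + inferred.get(0));
    }
  }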
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java index fdea8b35..f9531be0 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -159,7 +159,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { * @return Clustering result */ public Clustering<Model> run(Database database, Relation<NumberVector<?>> relation) { - Clustering<Model> ret = new Clustering<Model>("LMCLUS Clustering", "lmclus-clustering"); + Clustering<Model> ret = new Clustering<>("LMCLUS Clustering", "lmclus-clustering"); FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null; IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null; ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs()); @@ -204,10 +204,10 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } // New cluster found // TODO: annotate cluster with dimensionality - final Cluster<Model> cluster = new Cluster<Model>(current); + final Cluster<Model> cluster = new Cluster<>(current); cluster.setName("Cluster_" + lmDim + "d_" + cnum); cnum++; - ret.addCluster(cluster); + ret.addToplevelCluster(cluster); // Remove from main working set. unclustered.removeDBIDs(current); if (progress != null) { @@ -219,7 +219,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { } // Remaining objects are noise if (unclustered.size() > 0) { - ret.addCluster(new Cluster<Model>(unclustered, true)); + ret.addToplevelCluster(new Cluster<>(unclustered, true)); } if (progress != null) { progress.setProcessed(relation.size(), LOG); @@ -281,7 +281,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> { // Build orthogonal basis from remainder Matrix basis; { - List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1); + List<Vector> vectors = new ArrayList<>(sample.size() - 1); for (; iter.valid(); iter.advance()) { Vector vec = relation.get(iter).getColumnVector(); vectors.add(vec.minusEquals(originV)); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java index f567098b..a9c67a58 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -42,10 +42,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.generic.GenericDistanceDBIDList; import 
de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.GenericDistanceDBIDList; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; @@ -178,9 +178,9 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } // get the result - Clustering<Model> r = new Clustering<Model>("ORCLUS clustering", "orclus-clustering"); + Clustering<Model> r = new Clustering<>("ORCLUS clustering", "orclus-clustering"); for (ORCLUSCluster c : clusters) { - r.addCluster(new Cluster<Model>(c.objectIDs, ClusterModel.CLUSTER)); + r.addToplevelCluster(new Cluster<Model>(c.objectIDs, ClusterModel.CLUSTER)); } return r; } catch (Exception e) { @@ -198,7 +198,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri private List<ORCLUSCluster> initialSeeds(Relation<V> database, int k) { DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, rnd); NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database); - List<ORCLUSCluster> seeds = new ArrayList<ORCLUSCluster>(); + List<ORCLUSCluster> seeds = new ArrayList<>(); for (DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) { seeds.add(new ORCLUSCluster(database.get(iter), iter, factory)); } @@ -222,7 +222,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri } // projected centroids of the clusters - List<V> projectedCentroids = new ArrayList<V>(clusters.size()); + List<V> projectedCentroids = new ArrayList<>(clusters.size()); for (ORCLUSCluster c : clusters) { projectedCentroids.add(projection(c, c.centroid, factory)); } @@ -270,7 +270,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri private Matrix findBasis(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, ORCLUSCluster cluster, int dim) { // covariance matrix of cluster // Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs); - GenericDistanceDBIDList<DoubleDistance> results = new GenericDistanceDBIDList<DoubleDistance>(cluster.objectIDs.size()); + GenericDistanceDBIDList<DoubleDistance> results = new GenericDistanceDBIDList<>(cluster.objectIDs.size()); for (DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) { DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it)); results.add(distance, it); @@ -303,7 +303,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri * @param d_new the new dimensionality of the subspaces for each seed */ private void merge(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters, int k_new, int d_new, IndefiniteProgress cprogress) { - ArrayList<ProjectedEnergy> projectedEnergies = new ArrayList<ProjectedEnergy>(); + ArrayList<ProjectedEnergy> projectedEnergies = new ArrayList<>(); for (int i = 0; i < clusters.size(); i++) { for (int j = 0; j < clusters.size(); j++) { if (i >= j) { @@ -387,16 +387,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri ORCLUSCluster c_ij = union(database, distFunc, c_i, c_j, dim); NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database); - DoubleDistance sum = 
getDistanceFunction().getDistanceFactory().nullDistance(); + double sum = 0.; V c_proj = projection(c_ij, c_ij.centroid, factory); for (DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) { V o_proj = projection(c_ij, database.get(iter), factory); - DoubleDistance dist = distFunc.distance(o_proj, c_proj); - sum = sum.plus(dist.times(dist)); + double dist = distFunc.distance(o_proj, c_proj).doubleValue(); + sum += dist * dist; } - DoubleDistance projectedEnergy = sum.times(1.0 / c_ij.objectIDs.size()); + sum /= c_ij.objectIDs.size(); - return new ProjectedEnergy(i, j, c_ij, projectedEnergy); + return new ProjectedEnergy(i, j, c_ij, sum); } /** @@ -520,9 +520,9 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri ORCLUSCluster cluster; - DoubleDistance projectedEnergy; + double projectedEnergy; - ProjectedEnergy(int i, int j, ORCLUSCluster cluster, DoubleDistance projectedEnergy) { + ProjectedEnergy(int i, int j, ORCLUSCluster cluster, double projectedEnergy) { this.i = i; this.j = j; this.cluster = cluster; @@ -538,7 +538,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri */ @Override public int compareTo(ProjectedEnergy o) { - return this.projectedEnergy.compareTo(o.projectedEnergy); + return Double.compare(projectedEnergy, o.projectedEnergy); } } @@ -606,7 +606,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri @Override protected ORCLUS<V> makeInstance() { - return new ORCLUS<V>(k, k_i, l, alpha, rnd, pca); + return new ORCLUS<>(k, k_i, l, alpha, rnd, pca); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java index 0153ddc3..95cb2e58 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHInterval.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java index 12f10725..328fe3b3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/CASHIntervalSplit.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -81,8 +81,8 @@ public class CASHIntervalSplit { this.database = database; this.minPts = minPts; - this.f_minima = new HashMap<HyperBoundingBox, Map<DBID, Double>>(); - this.f_maxima = new HashMap<HyperBoundingBox, Map<DBID, Double>>(); + this.f_minima = new HashMap<>(); + this.f_maxima = new HashMap<>(); } /** @@ -108,9 +108,9 @@ public class CASHIntervalSplit { Map<DBID, Double> minima = f_minima.get(interval); Map<DBID, Double> maxima = f_maxima.get(interval); if(minima == null || 
maxima == null) { - minima = new HashMap<DBID, Double>(); + minima = new HashMap<>(); f_minima.put(interval, minima); - maxima = new HashMap<DBID, Double>(); + maxima = new HashMap<>(); f_maxima.put(interval, maxima); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/ParameterizationFunction.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/ParameterizationFunction.java index 56e68bfe..5c690feb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/ParameterizationFunction.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/ParameterizationFunction.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java index 8b6d104c..bfc272fd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/cash/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java index 665de632..89d3c930 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java index a4440a29..27cc48d6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/CorePredicate.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java index 2b946f1c..545a8171 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 
Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,11 +32,11 @@ import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; @@ -91,7 +91,7 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh public <T> NeighborPredicate.Instance<T> instantiate(Database database, SimpleTypeInformation<?> type) { DistanceQuery<O, D> dq = QueryUtil.getDistanceQuery(database, distFunc); RangeQuery<O, D> rq = database.getRangeQuery(dq); - return (NeighborPredicate.Instance<T>) new Instance<D>(epsilon, rq, dq.getRelation().getDBIDs()); + return (NeighborPredicate.Instance<T>) new Instance<>(epsilon, rq, dq.getRelation().getDBIDs()); } @Override @@ -109,7 +109,7 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh * * @author Erich Schubert */ - public static class Instance<D extends Distance<D>> implements NeighborPredicate.Instance<DistanceDBIDResult<D>> { + public static class Instance<D extends Distance<D>> implements NeighborPredicate.Instance<DistanceDBIDList<D>> { /** * Range to query with */ @@ -145,12 +145,12 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh } @Override - public DistanceDBIDResult<D> getNeighbors(DBIDRef reference) { + public DistanceDBIDList<D> getNeighbors(DBIDRef reference) { return rq.getRangeForDBID(reference, epsilon); } @Override - public void addDBIDs(ModifiableDBIDs ids, DistanceDBIDResult<D> neighbors) { + public void addDBIDs(ModifiableDBIDs ids, DistanceDBIDList<D> neighbors) { ids.addDBIDs(neighbors); } } @@ -177,14 +177,14 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh protected void makeOptions(Parameterization config) { super.makeOptions(config); // Get a distance function. 
- ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<DistanceFunction<O, D>>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); D distanceFactory = null; if(config.grab(distanceP)) { distfun = distanceP.instantiateClass(config); distanceFactory = distfun.getDistanceFactory(); } // Get the epsilon parameter - DistanceParameter<D> epsilonP = new DistanceParameter<D>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory); + DistanceParameter<D> epsilonP = new DistanceParameter<>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } @@ -192,7 +192,7 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh @Override protected EpsilonNeighborPredicate<O, D> makeInstance() { - return new EpsilonNeighborPredicate<O, D>(epsilon, distfun); + return new EpsilonNeighborPredicate<>(epsilon, distfun); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java index ef1cb0dc..1e0a8642 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -32,6 +32,7 @@ import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.model.CoreObjectsModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; @@ -53,6 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** @@ -67,7 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @author Erich Schubert * @author Arthur Zimek - * + * * @apiviz.landmark * * @apiviz.has Instance @@ -92,22 +94,29 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl CorePredicate corepred; /** + * Track which objects are "core" objects. + */ + boolean coremodel = false; + + /** * Constructor for parameterized algorithm. * - * @param npred Neighbor predicate - * @param corepred Core point predicate + * @param npred Neighbor predicate. + * @param corepred Core point predicate. + * @param coremodel Keep track of core points. 
*/ - public GeneralizedDBSCAN(NeighborPredicate npred, CorePredicate corepred) { + public GeneralizedDBSCAN(NeighborPredicate npred, CorePredicate corepred, boolean coremodel) { super(); this.npred = npred; this.corepred = corepred; + this.coremodel = coremodel; } @Override public Clustering<Model> run(Database database) { for (SimpleTypeInformation<?> t : npred.getOutputType()) { if (corepred.acceptsType(t)) { - return new Instance<Object>(npred.instantiate(database, t), corepred.instantiate(database, t)).run(); + return new Instance<>(npred.instantiate(database, t), corepred.instantiate(database, t), coremodel).run(); } } throw new AbortException("No compatible types found."); @@ -127,7 +136,7 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl * Instance for a particular data set. * * @author Erich Schubert - * + * * @apiviz.composedOf CorePredicate.Instance * @apiviz.composedOf NeighborPredicate.Instance */ @@ -135,17 +144,12 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl /** * Unprocessed IDs */ - private static final int UNPROCESSED = -2; - - /** - * Noise IDs - */ - private static final int NOISE = -1; + private static final int UNPROCESSED = 0; /** * Noise IDs */ - private static final int FIRST_CLUSTER = 0; + private static final int NOISE = 1; /** * The neighborhood predicate @@ -158,15 +162,22 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl final CorePredicate.Instance<T> corepred; /** + * Track which objects are "core" objects. + */ + boolean coremodel = false; + + /** * Full Constructor * * @param npred Neighborhood predicate * @param corepred Core object predicate + * @param coremodel Keep track of core points. */ - public Instance(NeighborPredicate.Instance<T> npred, CorePredicate.Instance<T> corepred) { + public Instance(NeighborPredicate.Instance<T> npred, CorePredicate.Instance<T> corepred, boolean coremodel) { super(); this.npred = npred; this.corepred = corepred; + this.coremodel = coremodel; } /** @@ -177,78 +188,85 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl public Clustering<Model> run() { final DBIDs ids = npred.getIDs(); // Setup progress logging - final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustering", ids.size(), LOG) : null; - final IndefiniteProgress clusprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters", LOG) : null; + final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Generalized DBSCAN Clustering", ids.size(), LOG) : null; + final IndefiniteProgress clusprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters found", LOG) : null; // (Temporary) store the cluster ID assigned. final WritableIntegerDataStore clusterids = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, UNPROCESSED); - // Note: these are not exact! + // Note: these are not exact, as objects may be stolen from noise. final TIntArrayList clustersizes = new TIntArrayList(); + clustersizes.add(0); // Unprocessed dummy value. + clustersizes.add(0); // Noise counter. // Implementation Note: using Integer objects should result in // reduced memory use in the HashMap! - int clusterid = FIRST_CLUSTER; - int clustersize = 0; - int noisesize = 0; + int clusterid = NOISE + 1; // Iterate over all objects in the database. - for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { + for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { // Skip already processed ids. 
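      // Integer labels used in this loop (values as redefined above):
      // UNPROCESSED = 0 marks objects not yet visited, NOISE = 1 collects noise,
      // and real clusters are numbered from NOISE + 1 = 2 upward. In expandCluster
      // below, non-core members are stored with the negated cluster id, so the
      // later inversion step can use Math.abs(id) for cluster membership and the
      // sign to separate core from border points.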
- if(clusterids.intValue(id) != UNPROCESSED) { + if (clusterids.intValue(id) != UNPROCESSED) { continue; } // Evaluate Neighborhood predicate final T neighbors = npred.getNeighbors(id); // Evaluate Core-Point predicate: - if(corepred.isCorePoint(id, neighbors)) { + if (corepred.isCorePoint(id, neighbors)) { clusterids.putInt(id, clusterid); - clustersize = 1 + setbasedExpandCluster(clusterid, clusterids, neighbors, progress); + clustersizes.add(expandCluster(clusterid, clusterids, neighbors, progress)); // start next cluster on next iteration. - clustersizes.add(clustersize); - clustersize = 0; - clusterid += 1; - if(clusprogress != null) { + ++clusterid; + if (clusprogress != null) { clusprogress.setProcessed(clusterid, LOG); } - } - else { + } else { // otherwise, it's a noise point clusterids.putInt(id, NOISE); - noisesize += 1; + clustersizes.set(NOISE, clustersizes.get(NOISE) + 1); } // We've completed this element - if(progress != null) { + if (progress != null) { progress.incrementProcessed(LOG); } } // Finish progress logging. - if(progress != null) { + if (progress != null) { progress.ensureCompleted(LOG); } - if(clusprogress != null) { + if (clusprogress != null) { clusprogress.setCompleted(LOG); } // Transform cluster ID mapping into a clustering result: - ArrayList<ArrayModifiableDBIDs> clusterlists = new ArrayList<ArrayModifiableDBIDs>(clusterid + 1); - // add noise cluster storage - clusterlists.add(DBIDUtil.newArray(noisesize)); + ArrayList<ArrayModifiableDBIDs> clusterlists = new ArrayList<>(clusterid); + ArrayList<ArrayModifiableDBIDs> corelists = coremodel ? new ArrayList<ArrayModifiableDBIDs>(clusterid) : null; // add storage containers for clusters - for(int i = 0; i < clustersizes.size(); i++) { + for (int i = 0; i < clustersizes.size(); i++) { clusterlists.add(DBIDUtil.newArray(clustersizes.get(i))); + if (corelists != null) { + corelists.add(DBIDUtil.newArray(clustersizes.get(i))); + } } // do the actual inversion - for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { - int cluster = clusterids.intValue(id); - clusterlists.get(cluster + 1).add(id); + for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { + // Negative values are non-core points: + int cid = clusterids.intValue(id); + int cluster = Math.abs(cid); + clusterlists.get(cluster).add(id); + if (corelists != null && cid > NOISE) { + corelists.get(cluster).add(id); + } } clusterids.destroy(); - Clustering<Model> result = new Clustering<Model>("GDBSCAN", "gdbscan-clustering"); - int cid = 0; - for(ArrayModifiableDBIDs res : clusterlists) { - boolean isNoise = (cid == 0); - Cluster<Model> c = new Cluster<Model>(res, isNoise, ClusterModel.CLUSTER); - result.addCluster(c); - cid++; + Clustering<Model> result = new Clustering<>("GDBSCAN", "gdbscan-clustering"); + for (int cid = NOISE; cid < clusterlists.size(); cid++) { + boolean isNoise = (cid == NOISE); + Cluster<Model> c; + if (corelists != null) { + c = new Cluster<Model>(clusterlists.get(cid), isNoise, new CoreObjectsModel(corelists.get(cid))); + } else { + c = new Cluster<Model>(clusterlists.get(cid), isNoise, ClusterModel.CLUSTER); + } + result.addToplevelCluster(c); } return result; } @@ -263,28 +281,36 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl * * @return cluster size */ - protected int setbasedExpandCluster(final int clusterid, final WritableIntegerDataStore clusterids, final T neighbors, final FiniteProgress progress) { - int clustersize = 0; + protected int expandCluster(final int clusterid, final 
WritableIntegerDataStore clusterids, final T neighbors, final FiniteProgress progress) { + int clustersize = 1; // initial seed! final ArrayModifiableDBIDs activeSet = DBIDUtil.newArray(); npred.addDBIDs(activeSet, neighbors); // run expandCluster as long as this set is non-empty (non-recursive // implementation) - while(!activeSet.isEmpty()) { + while (!activeSet.isEmpty()) { final DBID id = activeSet.remove(activeSet.size() - 1); - clustersize += 1; // Assign object to cluster - final int oldclus = clusterids.putInt(id, clusterid); - if(oldclus == -2) { + final int oldclus = clusterids.intValue(id); + if (oldclus == NOISE) { + clustersize += 1; + // Non core point cluster member: + clusterids.putInt(id, -clusterid); + } else if (oldclus == UNPROCESSED) { + clustersize += 1; // expandCluster again: // Evaluate Neighborhood predicate final T newneighbors = npred.getNeighbors(id); // Evaluate Core-Point predicate - if(corepred.isCorePoint(id, newneighbors)) { + if (corepred.isCorePoint(id, newneighbors)) { // Note: the recursion is unrolled into iteration over the active // set. npred.addDBIDs(activeSet, newneighbors); + clusterids.putInt(id, clusterid); + } else { + // Non core point cluster member: + clusterids.putInt(id, -clusterid); } - if(progress != null) { + if (progress != null) { progress.incrementProcessed(LOG); } } @@ -302,43 +328,58 @@ public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> impl */ public static class Parameterizer extends AbstractParameterizer { /** - * Neighborhood predicate + * Neighborhood predicate. */ NeighborPredicate npred = null; /** - * Core point predicate + * Core point predicate. */ CorePredicate corepred = null; /** - * Parameter for neighborhood predicate + * Track which objects are "core" objects. + */ + boolean coremodel = false; + + /** + * Parameter for neighborhood predicate. */ public static final OptionID NEIGHBORHOODPRED_ID = new OptionID("gdbscan.neighborhood", "Neighborhood predicate for GDBSCAN"); /** - * Parameter for core predicate + * Parameter for core predicate. */ public static final OptionID COREPRED_ID = new OptionID("gdbscan.core", "Core point predicate for GDBSCAN"); + /** + * Flag to keep track of core points. + */ + public static final OptionID COREMODEL_ID = new OptionID("gdbscan.core-model", "Use a model that keeps track of core points. 
Needs more memory."); + @Override protected void makeOptions(Parameterization config) { // Neighborhood predicate - ObjectParameter<NeighborPredicate> npredOpt = new ObjectParameter<NeighborPredicate>(NEIGHBORHOODPRED_ID, NeighborPredicate.class, EpsilonNeighborPredicate.class); - if(config.grab(npredOpt)) { + ObjectParameter<NeighborPredicate> npredOpt = new ObjectParameter<>(NEIGHBORHOODPRED_ID, NeighborPredicate.class, EpsilonNeighborPredicate.class); + if (config.grab(npredOpt)) { npred = npredOpt.instantiateClass(config); } // Core point predicate - ObjectParameter<CorePredicate> corepredOpt = new ObjectParameter<CorePredicate>(COREPRED_ID, CorePredicate.class, MinPtsCorePredicate.class); - if(config.grab(corepredOpt)) { + ObjectParameter<CorePredicate> corepredOpt = new ObjectParameter<>(COREPRED_ID, CorePredicate.class, MinPtsCorePredicate.class); + if (config.grab(corepredOpt)) { corepred = corepredOpt.instantiateClass(config); } + + Flag coremodelOpt = new Flag(COREMODEL_ID); + if (config.grab(coremodelOpt)) { + coremodel = coremodelOpt.isTrue(); + } } @Override protected GeneralizedDBSCAN makeInstance() { - return new GeneralizedDBSCAN(npred, corepred); + return new GeneralizedDBSCAN(npred, corepred, coremodel); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java index 47097f9b..a6e62e2e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java index ed927696..c3e1e8c9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/NeighborPredicate.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java index 8be23c7d..7ea3c7e4 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/package-info.java @@ -22,7 +22,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java new file mode 100644 index 00000000..72b6fb57 --- /dev/null +++ 
b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java @@ -0,0 +1,84 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Centroid linkage clustering method, aka UPGMC: Unweighted Pair-Group Method + * using Centroids. + * + * Reference: + * <p> + * A. K. Jain and R. C. Dubes<br /> + * Algorithms for Clustering Data<br /> + * Prentice-Hall + * </p> + * + * @author Erich Schubert + */ +@Alias({ "centroid", "upgmc" }) +@Reference(authors = "A. K. Jain and R. C. Dubes", title = "Algorithms for Clustering Data", booktitle = "Algorithms for Clustering Data, Prentice-Hall") +public class CentroidLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final CentroidLinkageMethod STATIC = new CentroidLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. + */ + @Deprecated + public CentroidLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + final double wx = sizex / (double) (sizex + sizey); + final double wy = sizey / (double) (sizex + sizey); + final double beta = (sizex * sizey) / (double) ((sizex + sizey) * (sizex + sizey)); + return wx * dx + wy * dy - beta * dxy; + } + + /** + * Class parameterizer. + * + * Returns the static instance. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected CentroidLinkageMethod makeInstance() { + return STATIC; + } + } +} // Sokal and Michener (1958), Gower (1967) diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java new file mode 100644 index 00000000..0cb47fa7 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java @@ -0,0 +1,70 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Complete-linkage clustering method. + * + * @author Erich Schubert + */ +@Alias({ "complete", "clink", "complete-link", "farthest-neighbor" }) +public class CompleteLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final CompleteLinkageMethod STATIC = new CompleteLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. + */ + @Deprecated + public CompleteLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + return Math.max(dx, dy); + } + + /** + * Class parameterizer. + * + * Returns the static instance. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected CompleteLinkageMethod makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java new file mode 100644 index 00000000..ac5cb77c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java @@ -0,0 +1,854 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import gnu.trove.list.array.TDoubleArrayList; + +import java.util.ArrayList; +import java.util.Comparator; + +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.DendrogramModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DBIDDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.workflow.AlgorithmStep; + +/** + * Extract a flat clustering from a full hierarchy, represented in pointer form. + * + * FIXME: re-check tie handling! + * + * @author Erich Schubert + * + * @apiviz.uses HierarchicalClusteringAlgorithm + * @apiviz.uses PointerHierarchyRepresentationResult + * @apiviz.has Clustering + */ +public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implements ClusteringAlgorithm<Clustering<DendrogramModel<D>>> { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(ExtractFlatClusteringFromHierarchy.class); + + /** + * Threshold mode. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static enum ThresholdMode { + /** Cut by minimum number of clusters */ + BY_MINCLUSTERS, + /** Cut by threshold */ + BY_THRESHOLD, + /** No thresholding */ + NO_THRESHOLD, + } + + /** + * Output mode. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static enum OutputMode { + /** Strict partitioning. */ + STRICT_PARTITIONS, + /** Partial hierarchy. */ + PARTIAL_HIERARCHY, + } + + /** + * Minimum number of clusters to extract + */ + private final int minclusters; + + /** + * Clustering algorithm to run to obtain the hierarchy. + */ + private HierarchicalClusteringAlgorithm<D> algorithm; + + /** + * Include empty cluster in the hierarchy produced. + */ + private OutputMode outputmode = OutputMode.PARTIAL_HIERARCHY; + + /** + * Threshold for extracting clusters. + */ + private D threshold = null; + + /** + * Disallow singleton clusters, but add them to the parent cluster instead. + */ + private boolean singletons = false; + + /** + * Constructor. + * + * @param algorithm Algorithm to run + * @param minclusters Minimum number of clusters + * @param outputmode Output mode: truncated hierarchy or strict partitions. + * @param singletons Allow producing singleton clusters. + */ + public ExtractFlatClusteringFromHierarchy(HierarchicalClusteringAlgorithm<D> algorithm, int minclusters, OutputMode outputmode, boolean singletons) { + super(); + this.algorithm = algorithm; + this.threshold = null; + this.minclusters = minclusters; + this.outputmode = outputmode; + this.singletons = singletons; + } + + /** + * Constructor. + * + * @param algorithm Algorithm to run + * @param threshold Distance threshold + * @param outputmode Output mode: truncated hierarchy or strict partitions. + * @param singletons Allow producing singleton clusters. 
+ */ + public ExtractFlatClusteringFromHierarchy(HierarchicalClusteringAlgorithm<D> algorithm, D threshold, OutputMode outputmode, boolean singletons) { + super(); + this.algorithm = algorithm; + this.threshold = threshold; + this.minclusters = -1; + this.outputmode = outputmode; + this.singletons = singletons; + } + + @Override + public Clustering<DendrogramModel<D>> run(Database database) { + PointerHierarchyRepresentationResult<D> pointerresult = algorithm.run(database); + DBIDs ids = pointerresult.getDBIDs(); + DBIDDataStore pi = pointerresult.getParentStore(); + DataStore<D> lambda = pointerresult.getParentDistanceStore(); + + Clustering<DendrogramModel<D>> result; + if (lambda instanceof DoubleDistanceDataStore) { + result = extractClustersDouble(ids, pi, (DoubleDistanceDataStore) lambda); + } else { + result = extractClusters(ids, pi, lambda); + } + result.addChildResult(pointerresult); + + return result; + } + + /** + * Extract all clusters from the pi-lambda-representation. + * + * @param ids Object ids to process + * @param pi Pi store + * @param lambda Lambda store + * + * @return Hierarchical clustering + */ + private Clustering<DendrogramModel<D>> extractClusters(DBIDs ids, final DBIDDataStore pi, final DataStore<D> lambda) { + FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), LOG) : null; + + // Sort DBIDs by lambda. We need this for two things: + // a) to determine the stop distance from "minclusters" parameter + // b) to process arrows in decreasing / increasing order + ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); + order.sort(new CompareByLambda<>(lambda)); + DBIDArrayIter it = order.iter(); // Used multiple times! + + int split; + if (minclusters > 0) { + split = Math.max(ids.size() - minclusters, 0); + // Stop distance: + final D stopdist = lambda.get(order.get(split)); + + // Tie handling: decrement split. + while (split > 0) { + it.seek(split - 1); + if (stopdist.compareTo(lambda.get(it)) <= 0) { + split--; + } else { + break; + } + } + } else if (threshold != null) { + split = ids.size(); + it.seek(split - 1); + while (threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) { + split--; + it.retract(); + } + } else { // full hierarchy + split = 0; + } + + // Extract the child clusters + int expcnum = ids.size() - split; + WritableIntegerDataStore cluster_map = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -1); + ArrayList<ModifiableDBIDs> cluster_dbids = new ArrayList<>(expcnum); + ArrayList<D> cluster_dist = new ArrayList<>(expcnum); + ArrayModifiableDBIDs cluster_leads = DBIDUtil.newArray(expcnum); + + DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. + // Go backwards on the lower part. + for (it.seek(split - 1); it.valid(); it.retract()) { + D dist = lambda.get(it); // Distance to successor + pi.assignVar(it, succ); // succ = pi(it) + int clusterid = cluster_map.intValue(succ); + // Successor cluster has already been created: + if (clusterid >= 0) { + cluster_dbids.get(clusterid).add(it); + cluster_map.putInt(it, clusterid); + // Update distance to maximum encountered: + if (cluster_dist.get(clusterid).compareTo(dist) < 0) { + cluster_dist.set(clusterid, dist); + } + } else { + // Need to start a new cluster: + clusterid = cluster_dbids.size(); // next cluster number. 
+ ModifiableDBIDs cids = DBIDUtil.newArray(); + // Add element and successor as initial members: + cids.add(succ); + cluster_map.putInt(succ, clusterid); + cids.add(it); + cluster_map.putInt(it, clusterid); + // Store new cluster. + cluster_dbids.add(cids); + cluster_leads.add(succ); + cluster_dist.add(dist); + } + + // Decrement counter + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + final Clustering<DendrogramModel<D>> dendrogram; + switch(outputmode) { + case PARTIAL_HIERARCHY: { + // Build a hierarchy out of these clusters. + dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); + Cluster<DendrogramModel<D>> root = null; + ArrayList<Cluster<DendrogramModel<D>>> clusters = new ArrayList<>(expcnum); + // Convert initial clusters to cluster objects + { + int i = 0; + for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); + } + cluster_dist = null; // Invalidate + cluster_dbids = null; // Invalidate + } + // Process the upper part, bottom-up. + for (it.seek(split); it.valid(); it.advance()) { + int clusterid = cluster_map.intValue(it); + // The current cluster led by the current element: + final Cluster<DendrogramModel<D>> clus; + if (clusterid >= 0) { + clus = clusters.get(clusterid); + } else if (!singletons && ids.size() != 1) { + clus = null; + } else { + clus = makeCluster(it, null, DBIDUtil.deref(it)); + } + // The successor to join: + pi.assignVar(it, succ); // succ = pi(it) + if (DBIDUtil.equal(it, succ)) { + assert (root == null); + root = clus; + } else { + // Parent cluster: + int parentid = cluster_map.intValue(succ); + D depth = lambda.get(it); + // Parent cluster exists - merge as a new cluster: + if (parentid >= 0) { + final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); + if (pclus.getModel().getDistance().equals(depth)) { + if (clus == null) { + ((ModifiableDBIDs) pclus.getIDs()).add(it); + } else { + dendrogram.addChildCluster(pclus, clus); + } + } else { + // Merge at new depth: + ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); + if (clus == null) { + cids.add(it); + } + Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); + if (clus != null) { + dendrogram.addChildCluster(npclus, clus); + } + dendrogram.addChildCluster(npclus, pclus); + // Replace existing parent cluster: new depth + clusters.set(parentid, npclus); + } + } else { + // Merge with parent at this depth: + final Cluster<DendrogramModel<D>> pclus; + if (!singletons) { + ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1); + cids.add(succ); + if (clus == null) { + cids.add(it); + } + // New cluster for parent and/or new point + pclus = makeCluster(succ, depth, cids); + } else { + // Create a new, one-element cluster for parent, and a merged + // cluster on top. + pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); + dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); + } + if (clus != null) { + dendrogram.addChildCluster(pclus, clus); + } + // Store cluster: + parentid = clusters.size(); + clusters.add(pclus); // Remember parent cluster + cluster_map.putInt(succ, parentid); // Reference + } + } + + // Decrement counter + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + assert (root != null); + // attach root + dendrogram.addToplevelCluster(root); + break; + } + case STRICT_PARTITIONS: { + // Build a hierarchy out of these clusters. 
+ dendrogram = new Clustering<>("Flattened Hierarchical Clustering", "flattened-hierarchical-clustering"); + // Convert initial clusters to cluster objects + { + int i = 0; + for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + dendrogram.addToplevelCluster(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); + } + cluster_dist = null; // Invalidate + cluster_dbids = null; // Invalidate + } + // Process the upper part, bottom-up. + for (it.seek(split); it.valid(); it.advance()) { + int clusterid = cluster_map.intValue(it); + if (clusterid < 0) { + dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); + } + + // Decrement counter + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + break; + } + default: + throw new AbortException("Unsupported output mode."); + } + + if (progress != null) { + progress.ensureCompleted(LOG); + } + + return dendrogram; + } + + /** + * Extract all clusters from the pi-lambda-representation. + * + * @param ids Object ids to process + * @param pi Pi store + * @param lambda Lambda store + * + * @return Hierarchical clustering + */ + private Clustering<DendrogramModel<D>> extractClustersDouble(DBIDs ids, final DBIDDataStore pi, final DoubleDistanceDataStore lambda) { + FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), LOG) : null; + + // Sort DBIDs by lambda. We need this for two things: + // a) to determine the stop distance from "minclusters" parameter + // b) to process arrows in decreasing / increasing order + ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); + order.sort(new CompareByDoubleLambda(lambda)); + DBIDArrayIter it = order.iter(); // Used multiple times! + + int split; + if (minclusters > 0) { + split = Math.max(ids.size() - minclusters, 0); + // Stop distance: + final double stopdist = lambda.doubleValue(order.get(split)); + + // Tie handling: decrement split. + while (split > 0) { + it.seek(split - 1); + if (stopdist <= lambda.doubleValue(it)) { + split--; + } else { + break; + } + } + } else if (threshold != null) { + split = ids.size(); + it.seek(split - 1); + double stopdist = ((DoubleDistance) threshold).doubleValue(); + while (stopdist <= lambda.doubleValue(it) && it.valid()) { + split--; + it.retract(); + } + } else { // full hierarchy + split = 0; + } + + // Extract the child clusters + int expcnum = ids.size() - split; + WritableIntegerDataStore cluster_map = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -1); + ArrayList<ModifiableDBIDs> cluster_dbids = new ArrayList<>(expcnum); + TDoubleArrayList cluster_dist = new TDoubleArrayList(expcnum); + ArrayModifiableDBIDs cluster_leads = DBIDUtil.newArray(expcnum); + + DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. + // Go backwards on the lower part. + for (it.seek(split - 1); it.valid(); it.retract()) { + double dist = lambda.doubleValue(it); // Distance to successor + pi.assignVar(it, succ); // succ = pi(it) + int clusterid = cluster_map.intValue(succ); + // Successor cluster has already been created: + if (clusterid >= 0) { + cluster_dbids.get(clusterid).add(it); + cluster_map.putInt(it, clusterid); + // Update distance to maximum encountered: + if (cluster_dist.get(clusterid) < dist) { + cluster_dist.set(clusterid, dist); + } + } else { + // Need to start a new cluster: + clusterid = cluster_dbids.size(); // next cluster number. 
+ ModifiableDBIDs cids = DBIDUtil.newArray(); + // Add element and successor as initial members: + cids.add(succ); + cluster_map.putInt(succ, clusterid); + cids.add(it); + cluster_map.putInt(it, clusterid); + // Store new cluster. + cluster_dbids.add(cids); + cluster_leads.add(succ); + cluster_dist.add(dist); + } + + // Decrement counter + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + final Clustering<DendrogramModel<D>> dendrogram; + switch(outputmode) { + case PARTIAL_HIERARCHY: { + // Build a hierarchy out of these clusters. + dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); + Cluster<DendrogramModel<D>> root = null; + ArrayList<Cluster<DendrogramModel<D>>> clusters = new ArrayList<>(expcnum); + // Convert initial clusters to cluster objects + { + int i = 0; + for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + @SuppressWarnings("unchecked") + D depth = (D) new DoubleDistance(cluster_dist.get(i)); + clusters.add(makeCluster(it2, depth, cluster_dbids.get(i))); + } + cluster_dist = null; // Invalidate + cluster_dbids = null; // Invalidate + } + // Process the upper part, bottom-up. + for (it.seek(split); it.valid(); it.advance()) { + int clusterid = cluster_map.intValue(it); + // The current cluster led by the current element: + final Cluster<DendrogramModel<D>> clus; + if (clusterid >= 0) { + clus = clusters.get(clusterid); + } else if (!singletons && ids.size() != 1) { + clus = null; + } else { + clus = makeCluster(it, null, DBIDUtil.deref(it)); + } + // The successor to join: + pi.assignVar(it, succ); // succ = pi(it) + if (DBIDUtil.equal(it, succ)) { + assert (root == null); + root = clus; + } else { + // Parent cluster: + int parentid = cluster_map.intValue(succ); + @SuppressWarnings("unchecked") + D depth = (D) new DoubleDistance(lambda.doubleValue(it)); + // Parent cluster exists - merge as a new cluster: + if (parentid >= 0) { + final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); + if (pclus.getModel().getDistance().equals(depth)) { + if (clus == null) { + ((ModifiableDBIDs) pclus.getIDs()).add(it); + } else { + dendrogram.addChildCluster(pclus, clus); + } + } else { + // Merge at new depth: + ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); + if (clus == null) { + cids.add(it); + } + Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); + if (clus != null) { + dendrogram.addChildCluster(npclus, clus); + } + dendrogram.addChildCluster(npclus, pclus); + // Replace existing parent cluster: new depth + clusters.set(parentid, npclus); + } + } else { + // Merge with parent at this depth: + final Cluster<DendrogramModel<D>> pclus; + if (!singletons) { + ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1); + cids.add(succ); + if (clus == null) { + cids.add(it); + } + // New cluster for parent and/or new point + pclus = makeCluster(succ, depth, cids); + } else { + // Create a new, one-element cluster for parent, and a merged + // cluster on top. 
+ pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); + dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); + } + if (clus != null) { + dendrogram.addChildCluster(pclus, clus); + } + // Store cluster: + parentid = clusters.size(); + clusters.add(pclus); // Remember parent cluster + cluster_map.putInt(succ, parentid); // Reference + } + } + + // Decrement counter + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + assert (root != null); + // attach root + dendrogram.addToplevelCluster(root); + break; + } + case STRICT_PARTITIONS: { + // Build a hierarchy out of these clusters. + dendrogram = new Clustering<>("Flattened Hierarchical Clustering", "flattened-hierarchical-clustering"); + // Convert initial clusters to cluster objects + { + int i = 0; + for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { + @SuppressWarnings("unchecked") + D depth = (D) new DoubleDistance(cluster_dist.get(i)); + dendrogram.addToplevelCluster(makeCluster(it2, depth, cluster_dbids.get(i))); + } + cluster_dist = null; // Invalidate + cluster_dbids = null; // Invalidate + } + // Process the upper part, bottom-up. + for (it.seek(split); it.valid(); it.advance()) { + int clusterid = cluster_map.intValue(it); + if (clusterid < 0) { + dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); + } + + // Decrement counter + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + break; + } + default: + throw new AbortException("Unsupported output mode."); + } + + if (progress != null) { + progress.ensureCompleted(LOG); + } + + return dendrogram; + } + + /** + * Make the cluster for the given object + * + * @param lead Leading object + * @param depth Linkage depth + * @param members Member objects + * @return Cluster + */ + private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members) { + final String name; + if (members.size() == 0) { + name = "mrg_" + DBIDUtil.toString(lead) + "_" + depth; + } else if (depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) { + name = "obj_" + DBIDUtil.toString(lead); + } else if (depth != null) { + name = "clu_" + DBIDUtil.toString(lead) + "_" + depth; + } else { + // Complete data set only? + name = "clu_" + DBIDUtil.toString(lead); + } + Cluster<DendrogramModel<D>> cluster = new Cluster<>(name, members, new DendrogramModel<>(depth)); + return cluster; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return algorithm.getInputTypeRestriction(); + } + + /** + * Order a DBID collection by the lambda value. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <D> Distance type + */ + private static final class CompareByLambda<D extends Distance<D>> implements Comparator<DBIDRef> { + /** + * Lambda storage + */ + private final DataStore<D> lambda; + + /** + * Constructor. + * + * @param lambda Lambda storage + */ + protected CompareByLambda(DataStore<D> lambda) { + this.lambda = lambda; + } + + @Override + public int compare(DBIDRef id1, DBIDRef id2) { + D k1 = lambda.get(id1); + D k2 = lambda.get(id2); + assert (k1 != null); + assert (k2 != null); + return k1.compareTo(k2); + } + } + + /** + * Order a DBID collection by the lambda value. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private static final class CompareByDoubleLambda implements Comparator<DBIDRef> { + /** + * Lambda storage + */ + private final DoubleDistanceDataStore lambda; + + /** + * Constructor. 
+ * + * @param lambda Lambda storage + */ + protected CompareByDoubleLambda(DoubleDistanceDataStore lambda) { + this.lambda = lambda; + } + + @Override + public int compare(DBIDRef id1, DBIDRef id2) { + double k1 = lambda.doubleValue(id1); + double k2 = lambda.doubleValue(id2); + return Double.compare(k1, k2); + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<D extends Distance<D>> extends AbstractParameterizer { + /** + * Extraction mode to use. + */ + public static final OptionID MODE_ID = new OptionID("hierarchical.threshold-mode", "The thresholding mode to use for extracting clusters: by desired number of clusters, or by distance threshold."); + + /** + * The minimum number of clusters to extract. + */ + public static final OptionID MINCLUSTERS_ID = new OptionID("hierarchical.minclusters", "The minimum number of clusters to extract (there may be more clusters when tied)."); + + /** + * The threshold level for which to extract the clustering. + */ + public static final OptionID THRESHOLD_ID = new OptionID("hierarchical.threshold", "The threshold level for which to extract the clusters."); + + /** + * Parameter to configure the output mode (nested or truncated clusters). + */ + public static final OptionID OUTPUTMODE_ID = new OptionID("hierarchical.output-mode", "The output mode: a truncated cluster hierarchy, or a strict (flat) partitioning of the data set."); + + /** + * Flag to produce singleton clusters. + */ + public static final OptionID SINGLETONS_ID = new OptionID("hierarchical.singletons", "Do not avoid singleton clusters. This produces a more complex hierarchy."); + + /** + * Number of clusters to extract. + */ + int minclusters = -1; + + /** + * Threshold level. + */ + D threshold = null; + + /** + * Flag to produce empty clusters to model the hierarchy above. + */ + OutputMode outputmode = null; + + /** + * The hierarchical clustering algorithm to run. + */ + HierarchicalClusteringAlgorithm<D> algorithm; + + /** + * Threshold mode. + */ + ThresholdMode thresholdmode = null; + + /** + * Also create singleton clusters. + */ + boolean singletons = false; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<HierarchicalClusteringAlgorithm<D>> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class); + if (config.grab(algorithmP)) { + algorithm = algorithmP.instantiateClass(config); + } + + EnumParameter<ThresholdMode> modeP = new EnumParameter<>(MODE_ID, ThresholdMode.class, ThresholdMode.BY_MINCLUSTERS); + if (config.grab(modeP)) { + thresholdmode = modeP.getValue(); + } + + if (thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) { + IntParameter minclustersP = new IntParameter(MINCLUSTERS_ID); + minclustersP.addConstraint(new GreaterEqualConstraint(1)); + if (config.grab(minclustersP)) { + minclusters = minclustersP.intValue(); + } + } + + if (thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) { + // Fallback to double when no algorithm chosen yet: + @SuppressWarnings("unchecked") + final D factory = algorithm != null ? 
algorithm.getDistanceFactory() : (D) DoubleDistance.FACTORY; + DistanceParameter<D> distP = new DistanceParameter<>(THRESHOLD_ID, factory); + if (config.grab(distP)) { + threshold = distP.getValue(); + } + } + + if (thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) { + EnumParameter<OutputMode> outputP = new EnumParameter<>(OUTPUTMODE_ID, OutputMode.class); + if (config.grab(outputP)) { + outputmode = outputP.getValue(); + } + } else { + // This becomes full hierarchy: + minclusters = -1; + outputmode = OutputMode.PARTIAL_HIERARCHY; + } + + Flag singletonsF = new Flag(SINGLETONS_ID); + if (config.grab(singletonsF)) { + singletons = singletonsF.isTrue(); + } + } + + @Override + protected ExtractFlatClusteringFromHierarchy<D> makeInstance() { + switch(thresholdmode) { + case NO_THRESHOLD: + case BY_MINCLUSTERS: + return new ExtractFlatClusteringFromHierarchy<>(algorithm, minclusters, outputmode, singletons); + case BY_THRESHOLD: + return new ExtractFlatClusteringFromHierarchy<>(algorithm, threshold, outputmode, singletons); + default: + throw new AbortException("Unknown extraction mode."); + } + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java new file mode 100644 index 00000000..079fb69b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java @@ -0,0 +1,82 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Group-average linkage clustering method. + * + * Reference: + * <p> + * A. K. Jain and R. C. Dubes<br /> + * Algorithms for Clustering Data<br /> + * Prentice-Hall + * </p> + * + * @author Erich Schubert + */ +@Alias({ "upgma", "average", "average-link", "average-linkage", "UPGMA" }) +@Reference(authors = "A. K. Jain and R. C. Dubes", title = "Algorithms for Clustering Data", booktitle = "Algorithms for Clustering Data, Prentice-Hall") +public class GroupAverageLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final GroupAverageLinkageMethod STATIC = new GroupAverageLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. 
+ */ + @Deprecated + public GroupAverageLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + final double wx = sizex / (double) (sizex + sizey); + final double wy = sizey / (double) (sizex + sizey); + return wx * dx + wy * dy; + } + + /** + * Class parameterizer. + * + * Returns the static instance. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected GroupAverageLinkageMethod makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java new file mode 100644 index 00000000..f3595d51 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java @@ -0,0 +1,51 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.algorithm.Algorithm; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; + +/** + * Interface for hierarchical clustering algorithms. + * + * This interface allows the algorithms to be used by e.g. + * {@link ExtractFlatClusteringFromHierarchy}. + * + * @author Erich Schubert + * + * @apiviz.has PointerHierarchyRepresentationResult + * + * @param <D> Distance type + */ +public interface HierarchicalClusteringAlgorithm<D extends Distance<D>> extends Algorithm { + @Override + public PointerHierarchyRepresentationResult<D> run(Database db); + + /** + * Return the distance type that will be used by the algorithm. + * + * @return Distance factory. 
+ */ + public D getDistanceFactory(); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java new file mode 100644 index 00000000..68d0b4d8 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java @@ -0,0 +1,56 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * Abstract interface for implementing a new linkage method into hierarchical + * clustering. + * + * Reference: + * <p> + * G. N. Lance and W. T. Williams<br /> + * A general theory of classificatory sorting strategies 1. Hierarchical systems + * <br/> + * The computer journal 9.4 (1967): 373-380. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "G. N. Lance and W. T. Williams", title = "A general theory of classificatory sorting strategies 1. Hierarchical systems", booktitle = "The computer journal 9.4", url = "http://dx.doi.org/ 10.1093/comjnl/9.4.373") +public interface LinkageMethod { + /** + * Compute combined linkage for two clusters. + * + * @param sizex Size of first cluster x before merging + * @param dx Distance of cluster x to j before merging + * @param sizey Size of second cluster y before merging + * @param dy Distance of cluster y to j before merging + * @param sizej Size of candidate cluster j + * @param dxy Distance between clusters x and y before merging + * @return Combined distance + */ + double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java new file mode 100644 index 00000000..fe167cec --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java @@ -0,0 +1,80 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
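The combine() contract declared above is the Lance-Williams recurrence: given the distances of two clusters x and y to some third cluster j, it returns the distance of the merged cluster to j. As an illustrative aside (not part of the committed change), a minimal conforming implementation for single linkage, where the merge distance is simply the smaller of the two distances; the class name and sample values below are made up for the example:

    // Illustrative sketch only: single linkage as a LinkageMethod,
    // i.e. d(merged, j) = min(d(x, j), d(y, j)).
    class MinimumLinkage implements LinkageMethod {
      @Override
      public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) {
        // Single linkage needs neither the cluster sizes nor d(x, y).
        return Math.min(dx, dy);
      }
    }
    // Example: with d(x, j) = 1.5 and d(y, j) = 2.0,
    // new MinimumLinkage().combine(3, 1.5, 2, 2.0, 1, 0.7) yields 1.5.

The SingleLinkageMethod class added later in this change implements exactly this rule.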
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Median-linkage clustering method: Weighted pair group method using centroids + * (WPGMC). + * + * Reference: + * <p> + * J.C. Gower<br/> + * A comparison of some methods of cluster analysis<br/> + * Biometrics (1967): 623-637. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "J. C. Gower", title = "A comparison of some methods of cluster analysis", booktitle = "Biometrics (1967)", url = "http://www.jstor.org/stable/10.2307/2528417") +@Alias({ "wpgmc", "WPGMC", "weighted-centroid" }) +public class MedianLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final MedianLinkageMethod STATIC = new MedianLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. + */ + @Deprecated + public MedianLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + return .5 * (dx + dy) - .25 * dxy; + } + + /** + * Class parameterizer. + * + * Returns the static instance. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected MedianLinkageMethod makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java new file mode 100644 index 00000000..ee3052a4 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java @@ -0,0 +1,303 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
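The WPGMC update defined above weights both children equally, regardless of their size. As an illustrative aside (not part of the committed change), a worked call against the static instance, with made-up distances:

    // Illustrative only: d(x, j) = 4.0, d(y, j) = 2.0, d(x, y) = 1.0.
    static double exampleMedianUpdate() {
      // .5 * (4.0 + 2.0) - .25 * 1.0 = 2.75; the sizes 5, 3 and 2 are ignored by WPGMC.
      return MedianLinkageMethod.STATIC.combine(5, 4.0, 3, 2.0, 2, 1.0);
    }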
+ */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * This tutorial will step you through implementing a well known clustering + * algorithm, agglomerative hierarchical clustering, in multiple steps. + * + * This is the third step, where we add support for different linkage + * strategies. + * + * This is the naive O(n^3) algorithm. See {@link SLINK} for a much faster + * algorithm (however, only for single-linkage). + * + * Reference for the unified concept: + * <p> + * G. N. Lance and W. T. Williams<br /> + * A general theory of classificatory sorting strategies 1. Hierarchical systems + * <br/> + * The computer journal 9.4 (1967): 373-380. + * </p> + * + * See also: + * <p> + * A Review of Classification<br /> + * R. M. Cormack<br /> + * Journal of the Royal Statistical Society. Series A, Vol. 134, No. 3 + * </p> + * + * @author Erich Schubert + * + * @apiviz.composedOf LinkageMethod + * + * @param <O> Object type + */ +@Reference(authors = "G. N. Lance and W. T. Williams", title = "A general theory of classificatory sorting strategies 1. Hierarchical systems", booktitle = "The computer journal 9.4", url = "http://dx.doi.org/ 10.1093/comjnl/9.4.373") +public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, PointerHierarchyRepresentationResult<DoubleDistance>> implements HierarchicalClusteringAlgorithm<DoubleDistance> { + /** + * Class logger + */ + private static final Logging LOG = Logging.getLogger(NaiveAgglomerativeHierarchicalClustering.class); + + /** + * Current linkage method in use. + */ + LinkageMethod linkage = WardLinkageMethod.STATIC; + + /** + * Constructor. + * + * @param distanceFunction Distance function to use + * @param linkage Linkage method + */ + public NaiveAgglomerativeHierarchicalClustering(DistanceFunction<? 
super O, D> distanceFunction, LinkageMethod linkage) { + super(distanceFunction); + this.linkage = linkage; + } + + /** + * Run the algorithm + * + * @param db Database + * @param relation Relation + * @return Clustering hierarchy + */ + public PointerHierarchyRepresentationResult<DoubleDistance> run(Database db, Relation<O> relation) { + DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction()); + ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); + final int size = ids.size(); + + if (size > 0x10000) { + throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow."); + } + if (SingleLinkageMethod.class.isInstance(linkage)) { + LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!"); + } + + // Compute the initial (lower triangular) distance matrix. + double[] scratch = new double[triangleSize(size)]; + DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter(); + // Position counter - must agree with computeOffset! + int pos = 0; + boolean square = WardLinkageMethod.class.isInstance(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction())); + for (ix.seek(0); ix.valid(); ix.advance()) { + for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { + scratch[pos] = dq.distance(ix, iy).doubleValue(); + // Ward uses variances -- i.e. squared values + if (square) { + scratch[pos] *= scratch[pos]; + } + pos++; + } + } + + // Initialize space for result: + WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); + WritableDoubleDistanceDataStore lambda = DataStoreUtil.makeDoubleDistanceStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); + WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + pi.put(it, it); + lambda.put(it, Double.POSITIVE_INFINITY); + csize.put(it, 1); + } + + // Repeat until everything merged into 1 cluster + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null; + for (int i = 1; i < size; i++) { + double mindist = Double.POSITIVE_INFINITY; + int x = -1, y = -1; + for (ix.seek(0); ix.valid(); ix.advance()) { + if (lambda.doubleValue(ix) < Double.POSITIVE_INFINITY) { + continue; + } + final int xbase = triangleSize(ix.getOffset()); + for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { + if (lambda.doubleValue(iy) < Double.POSITIVE_INFINITY) { + continue; + } + final int idx = xbase + iy.getOffset(); + if (scratch[idx] <= mindist) { + mindist = scratch[idx]; + x = ix.getOffset(); + y = iy.getOffset(); + } + } + } + assert (x >= 0 && y >= 0); + // Avoid allocating memory, by reusing existing iterators: + ix.seek(x); + iy.seek(y); + if (LOG.isDebuggingFine()) { + LOG.debugFine("Merging: " + DBIDUtil.toString(ix) + " -> " + DBIDUtil.toString(iy)); + } + // Perform merge in data structure: x -> y + // Since y < x, prefer keeping y, dropping x. + lambda.put(ix, mindist); + pi.put(ix, iy); + // Merge into cluster + int sizex = csize.intValue(ix), sizey = csize.intValue(iy); + csize.put(iy, sizex + sizey); + + // Update distance matrix. Note: miny < minx + + // Implementation note: most will not need sizej, and could save the + // hashmap lookup. 
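The scratch[] array filled above is a row-wise lower triangular distance matrix: for offsets y < x, the current distance d(x, y) is stored at triangleSize(x) + y, with triangleSize defined further down in this class. As an illustrative aside (not part of the committed change), a hypothetical helper that makes the index computation explicit:

    // Hypothetical helper mirroring the indexing convention used in run():
    // for y < x, scratch[triangleIndex(x, y)] holds the current distance d(x, y).
    static int triangleIndex(int x, int y) {
      assert (y < x);
      return ((x * (x - 1)) >>> 1) + y; // == triangleSize(x) + y
    }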
+ final int xbase = triangleSize(x), ybase = triangleSize(y); + + ij.seek(0); + // Write to (y, j), with j < y + for (; ij.getOffset() < y; ij.advance()) { + if (lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { + continue; + } + final int sizej = csize.intValue(ij); + scratch[ybase + ij.getOffset()] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, mindist); + } + ij.advance(); // Skip y + // Write to (j, y), with y < j < x + for (; ij.getOffset() < x; ij.advance()) { + if (lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { + continue; + } + final int jbase = triangleSize(ij.getOffset()); + final int sizej = csize.intValue(ij); + scratch[jbase + y] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + y], sizej, mindist); + } + ij.advance(); // Skip x + // Write to (j, y), with y < x < j + for (; ij.valid(); ij.advance()) { + if (lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { + continue; + } + final int sizej = csize.intValue(ij); + final int jbase = triangleSize(ij.getOffset()); + scratch[jbase + y] = linkage.combine(sizex, scratch[jbase + x], sizey, scratch[jbase + y], sizej, mindist); + } + if (prog != null) { + prog.incrementProcessed(LOG); + } + } + if (prog != null) { + prog.ensureCompleted(LOG); + } + + return new PointerHierarchyRepresentationResult<>(ids, pi, lambda); + } + + /** + * Compute the size of a complete x by x triangle (minus diagonal) + * + * @param x Offset + * @return Size of complete triangle + */ + protected static int triangleSize(int x) { + return (x * (x - 1)) >>> 1; + } + + @Override + public DoubleDistance getDistanceFactory() { + return DoubleDistance.FACTORY; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + // The input relation must match our distance function: + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> Object type + * @param <D> Distance type + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Option ID for linkage parameter. + */ + public static final OptionID LINKAGE_ID = new OptionID("hierarchical.linkage", "Linkage method to use (e.g. Ward, Single-Link)"); + + /** + * Current linkage in use. + */ + protected LinkageMethod linkage; + + @Override + protected void makeOptions(Parameterization config) { + // We don't call super, because we want a different default distance. 
+ ObjectParameter<DistanceFunction<O, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, DistanceFunction.class); + if (config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + } + + ObjectParameter<LinkageMethod> linkageP = new ObjectParameter<>(LINKAGE_ID, LinkageMethod.class); + linkageP.setDefaultValue(WardLinkageMethod.class); + if (config.grab(linkageP)) { + linkage = linkageP.instantiateClass(config); + } + } + + @Override + protected NaiveAgglomerativeHierarchicalClustering<O, D> makeInstance() { + return new NaiveAgglomerativeHierarchicalClustering<>(distanceFunction, linkage); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java new file mode 100644 index 00000000..c339fb09 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java @@ -0,0 +1,97 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.database.datastore.DBIDDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.result.BasicResult; + +/** + * The pointer representation of a hierarchical clustering. Each object is + * represented by a parent object and the distance at which it joins the parent + * objects cluster. + * + * @author Erich Schubert + * + * @param <D> Distance type + */ +public class PointerHierarchyRepresentationResult<D extends Distance<D>> extends BasicResult { + /** + * The DBIDs in this result. + */ + DBIDs ids; + + /** + * The parent DBID relation. + */ + DBIDDataStore parent; + + /** + * Distance to the parent object. + */ + DataStore<D> parentDistance; + + /** + * Constructor. + * + * @param ids IDs processed. + * @param parent Parent pointer. + * @param parentDistance Distance to parent. + */ + public PointerHierarchyRepresentationResult(DBIDs ids, DBIDDataStore parent, DataStore<D> parentDistance) { + super("Pointer Representation", "pointer-representation"); + this.ids = ids; + this.parent = parent; + this.parentDistance = parentDistance; + } + + /** + * Get the clustered DBIDs. + * + * @return DBIDs + */ + public DBIDs getDBIDs() { + return ids; + } + + /** + * Get the parent DBID relation. + * + * @return Parent relation. 
+ */ + public DBIDDataStore getParentStore() { + return parent; + } + + /** + * Get the distance to the parent. + * + * @return Parent distance. + */ + public DataStore<D> getParentDistanceStore() { + return parentDistance; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java new file mode 100644 index 00000000..f1b58868 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java @@ -0,0 +1,368 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.DistanceUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; + +/** + * Implementation of the efficient Single-Link Algorithm SLINK of R. Sibson. + * + * <p> + * Reference:<br /> + * R. Sibson: SLINK: An optimally efficient algorithm for the single-link + * cluster method. <br/> + * In: The Computer Journal 16 (1973), No. 1, p. 30-34. 
+ * </p> + * + * @author Elke Achtert + * @author Erich Schubert + * + * @apiviz.has SingleLinkageMethod + * + * @param <O> the type of DatabaseObject the algorithm is applied on + * @param <D> the type of Distance used + */ +@Title("SLINK: Single Link Clustering") +@Description("Hierarchical clustering algorithm based on single-link connectivity.") +@Reference(authors = "R. Sibson", title = "SLINK: An optimally efficient algorithm for the single-link cluster method", booktitle = "The Computer Journal 16 (1973), No. 1, p. 30-34.", url = "http://dx.doi.org/10.1093/comjnl/16.1.30") +@Alias(value = { "de.lmu.ifi.dbs.elki.algorithm.clustering.SLINK", "clustering.SLINK", "SLINK", "single-link", "single-linkage" }) +public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<O, D, PointerHierarchyRepresentationResult<D>> implements HierarchicalClusteringAlgorithm<D> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(SLINK.class); + + /** + * Constructor. + * + * @param distanceFunction Distance function + */ + public SLINK(DistanceFunction<? super O, D> distanceFunction) { + super(distanceFunction); + } + + /** + * Performs the SLINK algorithm on the given database. + */ + public PointerHierarchyRepresentationResult<D> run(Database database, Relation<O> relation) { + DBIDs ids = relation.getDBIDs(); + DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); + @SuppressWarnings("unchecked") + Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass(); + WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); + WritableDataStore<D> lambda = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, distCls); + // Temporary storage for m. + WritableDataStore<D> m = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); + + FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running SLINK", ids.size(), LOG) : null; + // has to be an array for monotonicity reasons! + ModifiableDBIDs processedIDs = DBIDUtil.newArray(ids.size()); + + // Optimized code path for double distances + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && lambda instanceof WritableDoubleDistanceDataStore && m instanceof WritableDoubleDistanceDataStore) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<? super O> dist = (PrimitiveDoubleDistanceFunction<? 
super O>) getDistanceFunction(); + WritableDoubleDistanceDataStore lambdad = (WritableDoubleDistanceDataStore) lambda; + WritableDoubleDistanceDataStore md = (WritableDoubleDistanceDataStore) m; + // apply the algorithm + for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { + step1double(id, pi, lambdad); + step2double(id, processedIDs, distQuery.getRelation(), dist, md); + step3double(id, pi, lambdad, processedIDs, md); + step4double(id, pi, lambdad, processedIDs); + + processedIDs.add(id); + + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + } else { + // apply the algorithm + for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { + step1(id, pi, lambda); + step2(id, processedIDs, distQuery, m); + step3(id, pi, lambda, processedIDs, m); + step4(id, pi, lambda, processedIDs); + + processedIDs.add(id); + + if (progress != null) { + progress.incrementProcessed(LOG); + } + } + } + + if (progress != null) { + progress.ensureCompleted(LOG); + } + // We don't need m anymore. + m.destroy(); + m = null; + + return new PointerHierarchyRepresentationResult<>(ids, pi, lambda); + } + + /** + * First step: Initialize P(id) = id, L(id) = infinity. + * + * @param id the id of the object to be inserted into the pointer + * representation + * @param pi Pi data store + * @param lambda Lambda data store + */ + private void step1(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda) { + // P(n+1) = n+1: + pi.put(id, id); + // L(n+1) = infinity + lambda.put(id, getDistanceFunction().getDistanceFactory().infiniteDistance()); + } + + /** + * Second step: Determine the pairwise distances from all objects in the + * pointer representation to the new object with the specified id. + * + * @param id the id of the object to be inserted into the pointer + * representation + * @param processedIDs the already processed ids + * @param m Data store + * @param distFunc Distance function to use + */ + private void step2(DBIDRef id, DBIDs processedIDs, DistanceQuery<O, D> distFunc, WritableDataStore<D> m) { + O newObj = distFunc.getRelation().get(id); + for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + // M(i) = dist(i, n+1) + m.put(it, distFunc.distance(it, newObj)); + } + } + + /** + * Third step: Determine the values for P and L + * + * @param id the id of the object to be inserted into the pointer + * representation + * @param pi Pi data store + * @param lambda Lambda data store + * @param processedIDs the already processed ids + * @param m Data store + */ + private void step3(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda, DBIDs processedIDs, WritableDataStore<D> m) { + DBIDVar p_i = DBIDUtil.newVar(); + // for i = 1..n + for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + D l_i = lambda.get(it); + D m_i = m.get(it); + pi.assignVar(it, p_i); // p_i = pi(it) + D mp_i = m.get(p_i); + + // if L(i) >= M(i) + if (l_i.compareTo(m_i) >= 0) { + // M(P(i)) = min { M(P(i)), L(i) } + m.put(p_i, DistanceUtil.min(mp_i, l_i)); + + // L(i) = M(i) + lambda.put(it, m_i); + + // P(i) = n+1; + pi.put(it, id); + } else { + // M(P(i)) = min { M(P(i)), M(i) } + m.put(p_i, DistanceUtil.min(mp_i, m_i)); + } + } + } + + /** + * Fourth step: Actualize the clusters if necessary + * + * @param id the id of the current object + * @param pi Pi data store + * @param lambda Lambda data store + * @param processedIDs the already processed ids + */ + private void step4(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda, DBIDs processedIDs) 
{ + DBIDVar p_i = DBIDUtil.newVar(); + // for i = 1..n + for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + D l_i = lambda.get(it); + pi.assignVar(it, p_i); // p_i = pi(it) + D lp_i = lambda.get(p_i); + + // if L(i) >= L(P(i)) + if (l_i.compareTo(lp_i) >= 0) { + // P(i) = n+1 + pi.put(it, id); + } + } + } + + /** + * First step: Initialize P(id) = id, L(id) = infinity. + * + * @param id the id of the object to be inserted into the pointer + * representation + * @param pi Pi data store + * @param lambda Lambda data store + */ + private void step1double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda) { + // P(n+1) = n+1: + pi.put(id, id); + // L(n+1) = infinity + lambda.putDouble(id, Double.POSITIVE_INFINITY); + } + + /** + * Second step: Determine the pairwise distances from all objects in the + * pointer representation to the new object with the specified id. + * + * @param id the id of the object to be inserted into the pointer + * representation + * @param processedIDs the already processed ids + * @param m Data store + * @param relation Data relation + * @param distFunc Distance function to use + */ + private void step2double(DBIDRef id, DBIDs processedIDs, Relation<? extends O> relation, PrimitiveDoubleDistanceFunction<? super O> distFunc, WritableDoubleDistanceDataStore m) { + O newObj = relation.get(id); + for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + // M(i) = dist(i, n+1) + m.putDouble(it, distFunc.doubleDistance(relation.get(it), newObj)); + } + } + + /** + * Third step: Determine the values for P and L + * + * @param id the id of the object to be inserted into the pointer + * representation + * @param pi Pi data store + * @param lambda Lambda data store + * @param processedIDs the already processed ids + * @param m Data store + */ + private void step3double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda, DBIDs processedIDs, WritableDoubleDistanceDataStore m) { + DBIDVar p_i = DBIDUtil.newVar(); + // for i = 1..n + for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + double l_i = lambda.doubleValue(it); + double m_i = m.doubleValue(it); + pi.assignVar(it, p_i); // p_i = pi(it) + double mp_i = m.doubleValue(p_i); + + // if L(i) >= M(i) + if (l_i >= m_i) { + // M(P(i)) = min { M(P(i)), L(i) } + m.putDouble(p_i, Math.min(mp_i, l_i)); + + // L(i) = M(i) + lambda.putDouble(it, m_i); + + // P(i) = n+1; + pi.put(it, id); + } else { + // M(P(i)) = min { M(P(i)), M(i) } + m.putDouble(p_i, Math.min(mp_i, m_i)); + } + } + } + + /** + * Fourth step: Actualize the clusters if necessary + * + * @param id the id of the current object + * @param pi Pi data store + * @param lambda Lambda data store + * @param processedIDs the already processed ids + */ + private void step4double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda, DBIDs processedIDs) { + DBIDVar p_i = DBIDUtil.newVar(); + // for i = 1..n + for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + double l_i = lambda.doubleValue(it); + pi.assignVar(it, p_i); // p_i = pi(it) + double lp_i = lambda.doubleValue(p_i); + + // if L(i) >= L(P(i)) + if (l_i >= lp_i) { + // P(i) = n+1 + pi.put(it, id); + } + } + } + + @Override + public D getDistanceFactory() { + return getDistanceFunction().getDistanceFactory(); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected 
Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + @Override + protected SLINK<O, D> makeInstance() { + return new SLINK<>(distanceFunction); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java new file mode 100644 index 00000000..7ef81692 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java @@ -0,0 +1,80 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Single-linkage clustering method. + * + * Reference: + * <p> + * K. Florek and J. Łukaszewicz and J. Perkal and H. Steinhaus and S. Zubrzycki<br/> + * Sur la liaison et la division des points d'un ensemble fini<br /> + * In Colloquium Mathematicae (Vol. 2, No. 3-4) + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "K. Florek and J. Łukaszewicz and J. Perkal and H. Steinhaus and S. Zubrzycki", title = "Sur la liaison et la division des points d'un ensemble fini", booktitle = "Colloquium Mathematicae (Vol. 2, No. 3-4)") +@Alias({ "single-link", "single", "slink", "nearest", "nearest-neighbor" }) +public class SingleLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final SingleLinkageMethod STATIC = new SingleLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. + */ + @Deprecated + public SingleLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + return Math.min(dx, dy); + } + + /** + * Class parameterizer. + * + * Returns the static instance. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected SingleLinkageMethod makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java new file mode 100644 index 00000000..488f011c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java @@ -0,0 +1,86 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; + +/** + * Ward's method clustering method. + * + * This criterion minimizes variances, and makes most sense when used with + * squared Euclidean distance, see + * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction} + * + * Reference: + * <p> + * Ward Jr, Joe H.<br /> + * Hierarchical grouping to optimize an objective function<br /> + * Journal of the American statistical association 58.301 (1963): 236-244. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "J. H. Ward Jr", title = "Hierarchical grouping to optimize an objective function", booktitle = "Journal of the American statistical association 58.301", url = "http://dx.doi.org/10.1080/01621459.1963.10500845") +@Alias({ "ward", "variance" }) +public class WardLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final WardLinkageMethod STATIC = new WardLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. + */ + @Deprecated + public WardLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + final double wx = (sizex + sizej) / (double) (sizex + sizey + sizej); + final double wy = (sizey + sizej) / (double) (sizex + sizey + sizej); + final double beta = sizej / (double) (sizex + sizey + sizej); + return wx * dx + wy * dy - beta * dxy; + } + + /** + * Class parameterizer. + * + * Returns the static instance. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected WardLinkageMethod makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java new file mode 100644 index 00000000..ac0b17f5 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java @@ -0,0 +1,84 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Weighted average linkage clustering method. + * + * This is somewhat a misnomer, as it actually ignores that the clusters should + * likely be weighted differently according to their size when computing the + * average linkage. See {@link GroupAverageLinkageMethod} for the UPGMA method + * that uses the group size to weight the objects the same way. + * + * Reference: + * <p> + * A. K. Jain and R. C. Dubes<br /> + * Algorithms for Clustering Data<br /> + * Prentice-Hall + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "A. K. Jain and R. C. Dubes", title = "Algorithms for Clustering Data", booktitle = "Algorithms for Clustering Data, Prentice-Hall") +@Alias({ "wpgma", "WPGMA" }) +public class WeightedAverageLinkageMethod implements LinkageMethod { + /** + * Static instance of class. + */ + public static final WeightedAverageLinkageMethod STATIC = new WeightedAverageLinkageMethod(); + + /** + * Constructor. + * + * @deprecated use the static instance {@link #STATIC} instead. + */ + @Deprecated + public WeightedAverageLinkageMethod() { + super(); + } + + @Override + public double combine(int sizex, double dx, int sizey, double dy, int sizej, double dxy) { + return .5 * (dx + dy); + } + + /** + * Class parameterizer. + * + * Returns the static instance. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected WeightedAverageLinkageMethod makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java index 47855aad..dc1fa47c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -43,9 +43,17 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Abstract base class for k-means implementations. @@ -59,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect; * @param <D> Distance type * @param <M> Cluster model type */ -public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans, ClusteringAlgorithm<Clustering<M>> { +public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractPrimitiveDistanceBasedAlgorithm<NumberVector<?>, D, Clustering<M>> implements KMeans<V, D, M>, ClusteringAlgorithm<Clustering<M>> { /** * Holds the value of {@link #K_ID}. */ @@ -102,54 +110,53 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) { boolean changed = false; - if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? 
super NumberVector<?>>) getDistanceFunction(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if(dist < mindist) { + if (dist < mindist) { minIndex = i; mindist = dist; } } - if(clusters.get(minIndex).add(iditer)) { + if (clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { + for (int i = 0; i < k; i++) { + if (i != minIndex) { + if (clusters.get(i).remove(iditer)) { break; } } } } } - } - else { + } else { final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if(dist.compareTo(mindist) < 0) { + if (dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } - if(clusters.get(minIndex).add(iditer)) { + if (clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { + for (int i = 0; i < k; i++) { + if (i != minIndex) { + if (clusters.get(i).remove(iditer)) { break; } } @@ -174,21 +181,24 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @return the mean vectors of the given clusters in the given database */ protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) { - List<Vector> newMeans = new ArrayList<Vector>(k); - for(int i = 0; i < k; i++) { + List<Vector> newMeans = new ArrayList<>(k); + for (int i = 0; i < k; i++) { ModifiableDBIDs list = clusters.get(i); Vector mean = null; - if(list.size() > 0) { + if (list.size() > 0) { double s = 1.0 / list.size(); DBIDIter iter = list.iter(); assert (iter.valid()); mean = database.get(iter).getColumnVector().timesEquals(s); + double[] raw = mean.getArrayRef(); iter.advance(); - for(; iter.valid(); iter.advance()) { - mean.plusTimesEquals(database.get(iter).getColumnVector(), s); + for (; iter.valid(); iter.advance()) { + NumberVector<?> vec = database.get(iter); + for (int j = 0; j < mean.getDimensionality(); j++) { + raw[j] += s * vec.doubleValue(j); + } } - } - else { + } else { mean = means.get(i).getColumnVector(); } newMeans.add(mean); @@ -207,19 +217,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan protected List<NumberVector<?>> medians(List<? extends ModifiableDBIDs> clusters, List<? 
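The rewritten means() above avoids allocating a temporary vector per object: each coordinate is scaled by 1/|cluster| and accumulated directly into the raw array of the mean. The same accumulation pattern on plain double[] data (hypothetical names, no ELKI types):

import java.util.Arrays;
import java.util.List;

// Cluster mean by scaled accumulation into a raw array; mirrors the loop structure above.
public class ScaledMeanExample {
  static double[] mean(List<double[]> cluster) {
    final double s = 1.0 / cluster.size();   // scale factor applied to every vector
    final double[] raw = new double[cluster.get(0).length];
    for (double[] vec : cluster) {
      for (int j = 0; j < raw.length; j++) {
        raw[j] += s * vec[j];                // no temporary vector objects needed
      }
    }
    return raw;
  }

  public static void main(String[] args) {
    double[] m = mean(Arrays.asList(new double[] { 1, 2 }, new double[] { 3, 6 }));
    System.out.println(Arrays.toString(m));  // [2.0, 4.0]
  }
}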
extends NumberVector<?>> medians, Relation<V> database) { final int dim = medians.get(0).getDimensionality(); final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database); - List<NumberVector<?>> newMedians = new ArrayList<NumberVector<?>>(k); - for(int i = 0; i < k; i++) { + List<NumberVector<?>> newMedians = new ArrayList<>(k); + for (int i = 0; i < k; i++) { ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i)); - if(list.size() > 0) { + if (list.size() > 0) { Vector mean = new Vector(dim); - for(int d = 0; d < dim; d++) { + for (int d = 0; d < dim; d++) { sorter.setDimension(d); DBID id = QuickSelect.median(list, sorter); mean.set(d, database.get(id).doubleValue(d)); } newMedians.add(mean); - } - else { + } else { newMedians.add((NumberVector<?>) medians.get(i)); } } @@ -235,7 +244,7 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan * @param op Cluster size change / Weight change */ protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) { - if(newsize == 0) { + if (newsize == 0) { return; // Keep old mean } Vector delta = vec.getColumnVector(); @@ -256,65 +265,62 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) { boolean changed = false; - if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { + if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) { // Raw distance function @SuppressWarnings("unchecked") final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction(); // Incremental update - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY; V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { double dist = df.doubleDistance(fv, means.get(i)); - if(dist < mindist) { + if (dist < mindist) { minIndex = i; mindist = dist; } } // Update the cluster mean incrementally: - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); - if(i == minIndex) { - if(ci.add(iditer)) { + if (i == minIndex) { + if (ci.add(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } - } - else if(ci.remove(iditer)) { + } else if (ci.remove(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } } } - } - else { + } else { // Raw distance function final PrimitiveDistanceFunction<? 
super NumberVector<?>, D> df = getDistanceFunction(); // Incremental update - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { D mindist = df.getDistanceFactory().infiniteDistance(); V fv = relation.get(iditer); int minIndex = 0; - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { D dist = df.distance(fv, means.get(i)); - if(dist.compareTo(mindist) < 0) { + if (dist.compareTo(mindist) < 0) { minIndex = i; mindist = dist; } } // Update the cluster mean incrementally: - for(int i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { ModifiableDBIDs ci = clusters.get(i); - if(i == minIndex) { - if(ci.add(iditer)) { + if (i == minIndex) { + if (ci.add(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size(), +1); changed = true; } - } - else if(ci.remove(iditer)) { + } else if (ci.remove(iditer)) { incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1); changed = true; } @@ -323,4 +329,76 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan } return changed; } + + @Override + public void setK(int k) { + this.k = k; + } + + @Override + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + this.distanceFunction = distanceFunction; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public abstract static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { + /** + * k Parameter. + */ + protected int k; + + /** + * Maximum number of iterations. + */ + protected int maxiter; + + /** + * Initialization method. + */ + protected KMeansInitialization<V> initializer; + + @Override + protected void makeOptions(Parameterization config) { + ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); + if (config.grab(distanceFunctionP)) { + distanceFunction = distanceFunctionP.instantiateClass(config); + if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { + getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); + } + } + + IntParameter kP = new IntParameter(K_ID); + kP.addConstraint(new GreaterConstraint(0)); + if (config.grab(kP)) { + k = kP.getValue(); + } + + ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class); + if (config.grab(initialP)) { + initializer = initialP.instantiateClass(config); + } + + IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); + maxiterP.addConstraint(new GreaterEqualConstraint(0)); + if (config.grab(maxiterP)) { + maxiter = maxiterP.getValue(); + } + } + + /** + * Get class logger. 
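macQueenIterate() moves one object at a time and adjusts only the two affected means in place via incrementalUpdateMean(), instead of recomputing all means after a full pass. A generic running-mean sketch of that idea (standard update formulas, not a transcription of the ELKI method):

// Running mean that can absorb or release a single point in O(dim).
public class RunningMeanExample {
  final double[] mean;
  int size;

  RunningMeanExample(int dim) {
    this.mean = new double[dim];
  }

  void add(double[] x) {
    size++;
    for (int j = 0; j < mean.length; j++) {
      mean[j] += (x[j] - mean[j]) / size;    // m' = m + (x - m) / n_new
    }
  }

  void remove(double[] x) {
    size--;                                  // caller must not empty the cluster
    for (int j = 0; j < mean.length; j++) {
      mean[j] -= (x[j] - mean[j]) / size;    // inverse update with the reduced size
    }
  }

  public static void main(String[] args) {
    RunningMeanExample m = new RunningMeanExample(1);
    m.add(new double[] { 2 });
    m.add(new double[] { 4 });
    m.add(new double[] { 9 });
    System.out.println(m.mean[0]);           // 5.0
    m.remove(new double[] { 9 });
    System.out.println(m.mean[0]);           // 3.0
  }
}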
+ * + * @return Logger + */ + abstract protected Logging getLogger(); + + @Override + abstract protected AbstractKMeans<V, D, ?> makeInstance(); + } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java index 3a69c806..9e3eb478 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeansInitialization.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java new file mode 100644 index 00000000..30bb640c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java @@ -0,0 +1,219 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality.KMeansQualityMeasure; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Run K-Means multiple times, and keep the best run. 
+ * + * @author Stephan Baier + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ +public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(BestOfMultipleKMeans.class); + + /** + * Number of trials to do. + */ + private int trials; + + /** + * Variant of kMeans for the bisecting step. + */ + private KMeans<V, D, M> innerkMeans; + + /** + * Quality measure which should be used. + */ + private KMeansQualityMeasure<? super V, ? super D> qualityMeasure; + + /** + * Constructor. + * + * @param trials Number of trials to do. + * @param innerkMeans K-Means variant to actually use. + * @param qualityMeasure Quality measure + */ + public BestOfMultipleKMeans(int trials, KMeans<V, D, M> innerkMeans, KMeansQualityMeasure<? super V, ? super D> qualityMeasure) { + super(); + this.trials = trials; + this.innerkMeans = innerkMeans; + this.qualityMeasure = qualityMeasure; + } + + @Override + public Clustering<M> run(Database database, Relation<V> relation) { + if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) { + throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass()); + } + final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction(); + Clustering<M> bestResult = null; + if (trials > 1) { + double bestCost = Double.POSITIVE_INFINITY; + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null; + for (int i = 0; i < trials; i++) { + Clustering<M> currentCandidate = innerkMeans.run(database, relation); + double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation); + + if (LOG.isVerbose()) { + LOG.verbose("Cost of candidate " + i + ": " + currentCost); + } + + if (currentCost < bestCost) { + bestResult = currentCandidate; + bestCost = currentCost; + } + if (prog != null) { + prog.incrementProcessed(LOG); + } + } + if (prog != null) { + prog.ensureCompleted(LOG); + } + } else { + bestResult = innerkMeans.run(database); + } + + return bestResult; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return innerkMeans.getInputTypeRestriction(); + } + + @Override + public DistanceFunction<? super V, D> getDistanceFunction() { + return innerkMeans.getDistanceFunction(); + } + + @Override + public void setK(int k) { + innerkMeans.setK(k); + } + + @Override + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + innerkMeans.setDistanceFunction(distanceFunction); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Stephan Baier + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>, M extends MeanModel<V>> extends AbstractParameterizer { + /** + * Parameter to specify the iterations of the bisecting step. + */ + public static final OptionID TRIALS_ID = new OptionID("kmeans.trials", "The number of trials to run."); + + /** + * Parameter to specify the kMeans variant. 
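The run() method above is a plain best-of-N loop: rerun the randomly initialized inner k-means and keep the candidate with the lowest cost under the configured quality measure. The control flow, stripped of ELKI types (functional interfaces used purely for illustration):

import java.util.Random;
import java.util.function.Supplier;
import java.util.function.ToDoubleFunction;

// Keep the best of several randomized runs, judged by a cost function (lower is better).
public class BestOfRestartsExample {
  static <R> R bestOf(int trials, Supplier<R> runOnce, ToDoubleFunction<R> cost) {
    R best = null;
    double bestCost = Double.POSITIVE_INFINITY;
    for (int i = 0; i < trials; i++) {
      R candidate = runOnce.get();
      double c = cost.applyAsDouble(candidate);
      if (c < bestCost) {
        best = candidate;
        bestCost = c;
      }
    }
    return best;
  }

  public static void main(String[] args) {
    Random rnd = new Random(0);
    // Toy stand-in: the "clustering" is a random number and its cost is the number itself.
    Double best = bestOf(5, rnd::nextDouble, (Double d) -> d);
    System.out.println(best);
  }
}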
+ */ + public static final OptionID KMEANS_ID = new OptionID("kmeans.algorithm", "KMeans variant to run multiple times."); + + /** + * Parameter to specify the variant of quality measure. + */ + public static final OptionID QUALITYMEASURE_ID = new OptionID("kmeans.qualitymeasure", "Quality measure variant for deciding which run to keep."); + + /** + * Number of trials to perform. + */ + protected int trials; + + /** + * Variant of kMeans to use. + */ + protected KMeans<V, D, M> kMeansVariant; + + /** + * Quality measure. + */ + protected KMeansQualityMeasure<? super V, ? super D> qualityMeasure; + + @Override + protected void makeOptions(Parameterization config) { + IntParameter trialsP = new IntParameter(TRIALS_ID); + trialsP.addConstraint(new GreaterEqualConstraint(1)); + if (config.grab(trialsP)) { + trials = trialsP.intValue(); + } + + ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); + if (config.grab(kMeansVariantP)) { + kMeansVariant = kMeansVariantP.instantiateClass(config); + } + + ObjectParameter<KMeansQualityMeasure<V, ? super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class); + if (config.grab(qualityMeasureP)) { + qualityMeasure = qualityMeasureP.instantiateClass(config); + } + } + + @Override + protected BestOfMultipleKMeans<V, D, M> makeInstance() { + return new BestOfMultipleKMeans<>(trials, kMeansVariant, qualityMeasure); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java new file mode 100644 index 00000000..a018c04b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java @@ -0,0 +1,186 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; + +/** + * K-Means initialization by repeatedly choosing the farthest point. + * + * Note: this is less random than other initializations, so running multiple + * times will be more likely to return the same local minima. + * + * @author Erich Schubert + * + * @param <V> Vector type + * @param <D> Distance type + */ +public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization<V> implements KMedoidsInitialization<V> { + /** + * Discard the first vector. + */ + boolean dropfirst = true; + + /** + * Constructor. + * + * @param rnd Random generator. + * @param dropfirst Flag to discard the first vector. + */ + public FarthestPointsInitialMeans(RandomFactory rnd, boolean dropfirst) { + super(rnd); + this.dropfirst = dropfirst; + } + + @Override + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { + // Get a distance query + if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); + + // Chose first mean + List<V> means = new ArrayList<>(k); + + Random random = rnd.getRandom(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + means.add(relation.get(first)); + + DBIDVar best = DBIDUtil.newVar(first); + for (int i = (dropfirst ? 0 : 1); i < k; i++) { + // Find farthest object: + double maxdist = Double.NEGATIVE_INFINITY; + for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + double dsum = 0.; + for (V ex : means) { + dsum += distQ.distance(ex, it).doubleValue(); + } + if (dsum > maxdist) { + maxdist = dsum; + best.set(it); + } + } + // Add new mean: + if (k == 0) { + means.clear(); // Remove temporary first element. + } + means.add(relation.get(best)); + } + + return means; + } + + @Override + public DBIDs chooseInitialMedoids(int k, DistanceQuery<? 
super V, ?> distQ2) { + if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) { + throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances."); + } + @SuppressWarnings("unchecked") + DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; + final Relation<?> relation = distQ.getRelation(); + // Chose first mean + ArrayModifiableDBIDs means = DBIDUtil.newArray(k); + + Random random = rnd.getRandom(); + DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter(); + means.add(first); + + DBIDVar best = DBIDUtil.newVar(first); + for (int i = (dropfirst ? 0 : 1); i < k; i++) { + // Find farthest object: + double maxdist = Double.NEGATIVE_INFINITY; + for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) { + double dsum = 0.; + for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) { + dsum += distQ.distance(ex, it).doubleValue(); + } + if (dsum > maxdist) { + maxdist = dsum; + best.set(it); + } + } + // Add new mean: + if (k == 0) { + means.clear(); // Remove temporary first element. + } + means.add(best); + } + + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { + /** + * Option ID to control the handling of the first object chosen. + */ + public static final OptionID DROPFIRST_ID = new OptionID("farthest.dropfirst", "Drop the first object chosen (which is chosen randomly) for the farthest points heuristic."); + + /** + * Flag for discarding the first object chosen. + */ + protected boolean dropfirst = true; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + Flag dropfirstP = new Flag(DROPFIRST_ID); + if (config.grab(dropfirstP)) { + dropfirst = dropfirstP.isTrue(); + } + } + + @Override + protected FarthestPointsInitialMeans<V, D> makeInstance() { + return new FarthestPointsInitialMeans<>(rnd, dropfirst); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java index 1e51f4d6..08e2f116 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FirstKInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -51,9 +52,9 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? 
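Both methods above follow the same greedy scheme: after a random first seed, every further seed is the object with the largest total distance to the seeds picked so far. A self-contained sketch of that heuristic on double[] data (squared Euclidean, no drop-first handling, hypothetical names):

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

// Farthest-point seeding: start from a random point, then repeatedly add the point
// with the largest summed distance to the seeds chosen so far. Illustrative only.
public class FarthestPointSeedingExample {
  static double sqDist(double[] a, double[] b) {
    double s = 0;
    for (int i = 0; i < a.length; i++) {
      double d = a[i] - b[i];
      s += d * d;
    }
    return s;
  }

  static List<double[]> seeds(double[][] data, int k, Random rnd) {
    List<double[]> means = new ArrayList<>(k);
    means.add(data[rnd.nextInt(data.length)]);   // first seed: chosen at random
    while (means.size() < k) {
      double[] best = null;
      double maxdist = Double.NEGATIVE_INFINITY;
      for (double[] cand : data) {
        double dsum = 0;
        for (double[] m : means) {
          dsum += sqDist(cand, m);               // total distance to the current seeds
        }
        if (dsum > maxdist) {
          maxdist = dsum;
          best = cand;
        }
      }
      means.add(best);
    }
    return means;
  }

  public static void main(String[] args) {
    double[][] data = { { 0, 0 }, { 0, 1 }, { 10, 10 }, { 10, 11 }, { 5, 5 } };
    for (double[] m : seeds(data, 2, new Random(42))) {
      System.out.println(m[0] + " " + m[1]);
    }
  }
}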
super NumberVector<?>, ?> distanceFunction) { DBIDIter iter = relation.iterDBIDs(); - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); for(int i = 0; i < k && iter.valid(); i++, iter.advance()) { means.add(relation.get(iter)); } @@ -80,7 +81,7 @@ public class FirstKInitialMeans<V> implements KMeansInitialization<V>, KMedoidsI public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { @Override protected FirstKInitialMeans<V> makeInstance() { - return new FirstKInitialMeans<V>(); + return new FirstKInitialMeans<>(); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java index 68fc4e48..29c0a5c8 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeans.java @@ -1,12 +1,10 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; - /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,12 +23,27 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; + /** * Some constants and options shared among kmeans family algorithms. * * @author Erich Schubert + * + * @param <V> Number vector type + * @param <D> Distance type + * @param <M> Actual model type */ -public interface KMeans { +public interface KMeans<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends ClusteringAlgorithm<Clustering<M>>, DistanceBasedAlgorithm<V, D> { /** * Parameter to specify the initialization method */ @@ -52,4 +65,27 @@ public interface KMeans { * Parameter to specify the random generator seed. */ public static final OptionID SEED_ID = new OptionID("kmeans.seed", "The random number generator seed."); -}
\ No newline at end of file + + /** + * Run the clustering algorithm. + * + * @param database Database to run on. + * @param rel Relation to process. + * @return Clustering result + */ + Clustering<M> run(Database database, Relation<V> rel); + + /** + * Set the value of k. Needed for some types of nested k-means. + * + * @param k K parameter + */ + void setK(int k); + + /** + * Set the distance function to use. + * + * @param distanceFunction Distance function. + */ + void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java new file mode 100644 index 00000000..37071d36 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java @@ -0,0 +1,231 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.LinkedList; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ProxyDatabase; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * The bisecting k-means algorithm works by starting with an initial + * partitioning into two clusters, then repeated splitting of the largest + * cluster to get additional clusters. + * + * Reference:<br> + * <p> + * M. Steinbach, G. Karypis, V. Kumar:<br /> + * A Comparison of Document Clustering Techniques<br /> + * KDD workshop on text mining. Vol. 400. No. 1 + * </p> + * + * @author Stephan Baier + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ +@Reference(authors = "M. Steinbach, G. Karypis, V. Kumar", title = "A Comparison of Document Clustering Techniques", booktitle = "KDD workshop on text mining. Vol. 400. No. 1") +public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractAlgorithm<Clustering<M>> implements KMeans<V, D, M> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KMeansBisecting.class); + + /** + * Variant of kMeans for the bisecting step. + */ + private KMeans<V, D, M> innerkMeans; + + /** + * Desired value of k. + */ + private int k; + + /** + * Constructor. + * + * @param k k parameter - number of result clusters + * @param innerkMeans KMeans variant parameter - for bisecting step + */ + public KMeansBisecting(int k, KMeans<V, D, M> innerkMeans) { + super(); + this.k = k; + this.innerkMeans = innerkMeans; + } + + @Override + public Clustering<M> run(Database database, Relation<V> relation) { + ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database); + + // Linked list is preferrable for scratch, as we will A) not need that many + // clusters and B) be doing random removals of the largest cluster (often at + // the head) + LinkedList<Cluster<M>> currentClusterList = new LinkedList<>(); + + FiniteProgress prog = LOG.isVerbose() ? 
new FiniteProgress("Bisecting k-means", k - 1, LOG) : null; + + for (int j = 0; j < this.k - 1; j++) { + // Choose a cluster to split and project database to cluster + if (currentClusterList.size() == 0) { + proxyDB = new ProxyDatabase(relation.getDBIDs(), database); + } else { + Cluster<M> largestCluster = null; + for (Cluster<M> cluster : currentClusterList) { + if (largestCluster == null || cluster.size() > largestCluster.size()) { + largestCluster = cluster; + } + } + currentClusterList.remove(largestCluster); + proxyDB.setDBIDs(largestCluster.getIDs()); + } + + // Run the inner k-means algorithm: + // FIXME: ensure we run on the correct relation in a multirelational + // setting! + Clustering<M> innerResult = innerkMeans.run(proxyDB); + // Add resulting clusters to current result. + currentClusterList.addAll(innerResult.getAllClusters()); + + if (prog != null) { + prog.incrementProcessed(LOG); + } + if (LOG.isVerbose()) { + LOG.verbose("Iteration " + j); + } + } + if (prog != null) { + prog.ensureCompleted(LOG); + } + + // add all current clusters to the result + Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means"); + for (Cluster<M> cluster : currentClusterList) { + result.addToplevelCluster(cluster); + } + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return innerkMeans.getInputTypeRestriction(); + } + + @Override + public DistanceFunction<? super V, D> getDistanceFunction() { + return innerkMeans.getDistanceFunction(); + } + + @Override + public void setK(int k) { + this.k = k; + } + + @Override + public void setDistanceFunction(PrimitiveDistanceFunction<? super NumberVector<?>, D> distanceFunction) { + innerkMeans.setDistanceFunction(distanceFunction); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Stephan Baier + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <D> Distance type + * @param <M> Model type + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>, M extends MeanModel<V>> extends AbstractParameterizer { + /** + * Parameter to specify the kMeans variant. + */ + public static final OptionID KMEANS_ID = new OptionID("bisecting.kmeansvariant", "KMeans variant"); + + /** + * Variant of kMeans + */ + protected KMeans<V, D, M> kMeansVariant; + + /** + * Desired number of clusters. + */ + protected int k; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + IntParameter kP = new IntParameter(KMeans.K_ID); + kP.addConstraint(new GreaterConstraint(1)); + if (config.grab(kP)) { + k = kP.intValue(); + } + + ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class, BestOfMultipleKMeans.class); + if (config.grab(kMeansVariantP)) { + ListParameterization kMeansVariantParameters = new ListParameterization(); + + // We will always invoke this with k=2! 
+ kMeansVariantParameters.addParameter(KMeans.K_ID, 2); + + ChainedParameterization combinedConfig = new ChainedParameterization(kMeansVariantParameters, config); + combinedConfig.errorsTo(config); + kMeansVariant = kMeansVariantP.instantiateClass(combinedConfig); + } + } + + @Override + protected KMeansBisecting<V, D, M> makeInstance() { + return new KMeansBisecting<>(k, kMeansVariant); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java index 54b3a2ce..06fb10c1 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansInitialization.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,6 +24,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; */ import java.util.List; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; @@ -31,7 +33,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; * Interface for initializing K-Means * * @author Erich Schubert - * + * * @apiviz.landmark * * @param <V> Object type @@ -40,10 +42,12 @@ public interface KMeansInitialization<V> { /** * Choose initial means * + * @param database Database context * @param relation Relation * @param k Parameter k - * @param distanceFunction Distance function + * @param distanceFunction Distance function + * * @return List of chosen means for k-means */ - public abstract List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction); + public abstract List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? 
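The overall control flow of the class is: keep a working list of clusters, repeatedly pull out the largest one, split it in two with the inner k-means (always configured with k=2, as above), and put both halves back until k clusters exist. A stripped-down sketch of that loop, with a trivial split-by-position standing in for the inner 2-means run (illustrative only):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Bisecting control flow: split the largest cluster in two until k clusters exist.
public class BisectingSketch {
  static List<List<Integer>> bisecting(List<Integer> data, int k) {
    List<List<Integer>> clusters = new ArrayList<>();
    clusters.add(data);
    while (clusters.size() < k) {
      // Find and remove the largest cluster.
      List<Integer> largest = clusters.get(0);
      for (List<Integer> c : clusters) {
        if (c.size() > largest.size()) {
          largest = c;
        }
      }
      clusters.remove(largest);
      // "Bisect" it - placeholder for an inner k-means run with k=2.
      int half = largest.size() / 2;
      clusters.add(new ArrayList<>(largest.subList(0, half)));
      clusters.add(new ArrayList<>(largest.subList(half, largest.size())));
    }
    return clusters;
  }

  public static void main(String[] args) {
    System.out.println(bisecting(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8), 3));
  }
}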
super NumberVector<?>, ?> distanceFunction); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java index f43c2277..e692293c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -36,19 +35,13 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the k-means algorithm, using Lloyd-style bulk iterations. @@ -90,28 +83,23 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten super(distanceFunction, k, maxiter, initializer); } - /** - * Run k-means. - * - * @param database Database - * @param relation relation to use - * @return result - */ + @Override public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } + IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("K-Means iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (LOG.isVerbose()) { - LOG.verbose("K-Means iteration " + (iteration + 1)); + if (prog != null) { + prog.incrementProcessed(LOG); } boolean changed = assignToNearestCluster(relation, means, clusters); // Stop if no cluster assignment changed. @@ -121,12 +109,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten // Recompute means. means = means(clusters, means, relation); } + if (prog != null) { + prog.setCompleted(LOG); + } + // Wrap result final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { - KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); - result.addCluster(new Cluster<KMeansModel<V>>(clusters.get(i), model)); + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -143,53 +135,15 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { - /** - * k Parameter. - */ - protected int k; - - /** - * Number of iterations. - */ - protected int maxiter; - - /** - * Initialization method. 
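For reference, the Lloyd iteration that this class wraps in ELKI machinery: assign every point to its nearest mean, recompute the means, and stop when no assignment changes or maxiter is exceeded (maxiter <= 0 meaning unbounded, as in the loop above). A compact standalone version on double[][] data (squared Euclidean, naive random initialization; a sketch, not the ELKI class):

import java.util.Arrays;
import java.util.Random;

// Minimal Lloyd-style k-means: nearest-mean assignment, mean recomputation, repeat.
public class LloydSketch {
  static int[] cluster(double[][] data, int k, int maxiter, Random rnd) {
    int dim = data[0].length;
    double[][] means = new double[k][];
    for (int i = 0; i < k; i++) {
      means[i] = data[rnd.nextInt(data.length)].clone();   // naive random init
    }
    int[] assign = new int[data.length];
    Arrays.fill(assign, -1);
    for (int iter = 0; maxiter <= 0 || iter < maxiter; iter++) {
      boolean changed = false;
      for (int p = 0; p < data.length; p++) {
        int best = 0;
        double bestd = Double.POSITIVE_INFINITY;
        for (int i = 0; i < k; i++) {
          double d = 0;
          for (int j = 0; j < dim; j++) {
            double diff = data[p][j] - means[i][j];
            d += diff * diff;                              // squared Euclidean distance
          }
          if (d < bestd) {
            bestd = d;
            best = i;
          }
        }
        if (assign[p] != best) {
          assign[p] = best;
          changed = true;
        }
      }
      if (!changed) {
        break;                                             // converged
      }
      // Recompute the means from the new assignment.
      double[][] sums = new double[k][dim];
      int[] counts = new int[k];
      for (int p = 0; p < data.length; p++) {
        counts[assign[p]]++;
        for (int j = 0; j < dim; j++) {
          sums[assign[p]][j] += data[p][j];
        }
      }
      for (int i = 0; i < k; i++) {
        if (counts[i] > 0) {
          for (int j = 0; j < dim; j++) {
            means[i][j] = sums[i][j] / counts[i];
          }
        }
      }
    }
    return assign;
  }

  public static void main(String[] args) {
    double[][] data = { { 0, 0 }, { 0, 1 }, { 9, 9 }, { 9, 10 } };
    System.out.println(Arrays.toString(cluster(data, 2, 0, new Random(1))));
  }
}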
- */ - protected KMeansInitialization<V> initializer; - + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { @Override - protected void makeOptions(Parameterization config) { - ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if(config.grab(distanceFunctionP)) { - distanceFunction = distanceFunctionP.instantiateClass(config); - if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { - LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); - } - } - - IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { - k = kP.getValue(); - } - - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { - initializer = initialP.instantiateClass(config); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { - maxiter = maxiterP.intValue(); - } + protected Logging getLogger() { + return LOG; } @Override protected KMeansLloyd<V, D> makeInstance() { - return new KMeansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + return new KMeansLloyd<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java index 0cc7c363..bb689bd3 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -37,20 +36,14 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.SquaredEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import 
de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the k-means algorithm, using MacQueen style incremental updates. @@ -89,24 +82,18 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex super(distanceFunction, k, maxiter, initializer); } - /** - * Run k-means. - * - * @param database Database - * @param relation relation to use - * @return Clustering result - */ + @Override public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + return new Clustering<>("k-Means Clustering", "kmeans-clustering"); } // Choose initial means - List<Vector> means = new ArrayList<Vector>(k); - for (NumberVector<?> nv : initializer.chooseInitialMeans(relation, k, getDistanceFunction())) { + List<Vector> means = new ArrayList<>(k); + for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) { means.add(nv.getColumnVector()); } // Initialize cluster and assign objects - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -114,22 +101,27 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex // Initial recomputation of the means. means = means(clusters, means, relation); + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null; // Refine result for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (LOG.isVerbose()) { - LOG.verbose("K-Means iteration " + (iteration + 1)); + if (prog != null) { + prog.incrementProcessed(LOG); } boolean changed = macQueenIterate(relation, means, clusters); if (!changed) { break; } } + if (prog != null) { + prog.setCompleted(LOG); + } + final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<KMeansModel<V>> result = new Clustering<KMeansModel<V>>("k-Means Clustering", "kmeans-clustering"); + Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering"); for (int i = 0; i < clusters.size(); i++) { DBIDs ids = clusters.get(i); - KMeansModel<V> model = new KMeansModel<V>(factory.newNumberVector(means.get(i).getArrayRef())); - result.addCluster(new Cluster<KMeansModel<V>>(ids, model)); + KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getArrayRef())); + result.addToplevelCluster(new Cluster<>(ids, model)); } return result; } @@ -146,53 +138,15 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { - /** - * k Parameter. - */ - protected int k; - - /** - * Maximum number of iterations. - */ - protected int maxiter; - - /** - * Initialization method. 
- */ - protected KMeansInitialization<V> initializer; - + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { @Override - protected void makeOptions(Parameterization config) { - ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); - if (config.grab(distanceFunctionP)) { - distanceFunction = distanceFunctionP.instantiateClass(config); - if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { - LOG.warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!"); - } - } - - IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { - k = kP.getValue(); - } - - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { - initializer = initialP.instantiateClass(config); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { - maxiter = maxiterP.getValue(); - } + protected Logging getLogger() { + return LOG; } @Override protected KMeansMacQueen<V, D> makeInstance() { - return new KMeansMacQueen<V, D>(distanceFunction, k, maxiter, initializer); + return new KMeansMacQueen<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java index a07953da..302ca86b 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,6 +26,8 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; @@ -70,17 +72,17 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("K-Means++ initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? 
super V, D>) distanceFunction; - DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); // Chose first mean - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); Random random = rnd.getRandom(); DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter()); @@ -99,7 +101,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten } double r = random.nextDouble() * weightsum; int pos = 0; - while(r > 0 && pos < weights.length) { + while(r > 0 && pos < weights.length - 1) { r -= weights[pos]; pos++; } @@ -125,7 +127,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten @Override public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) { if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) { - throw new AbortException("PAM initialization can only be used with numerical distances."); + throw new AbortException("K-Means++ initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") DistanceQuery<? super V, D> distQ = (DistanceQuery<? super V, D>) distQ2; @@ -244,7 +246,7 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected KMeansPlusPlusInitialMeans<V, D> makeInstance() { - return new KMeansPlusPlusInitialMeans<V, D>(rnd); + return new KMeansPlusPlusInitialMeans<>(rnd); } } }
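Editor's note: the loop-bound change above (pos < weights.length - 1) appears to guard against floating-point rounding in the running weight sum pushing the sampled index past the last candidate. The following is a minimal, self-contained sketch of the same k-means++ seeding idea on plain double[] data; the class and helper names are made up for illustration and this is not the ELKI implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/** Plain-Java sketch of k-means++ seeding (illustrative only). */
public class KMeansPlusPlusSeedingSketch {
  static double sqDist(double[] a, double[] b) {
    double s = 0;
    for (int d = 0; d < a.length; d++) {
      double diff = a[d] - b[d];
      s += diff * diff;
    }
    return s;
  }

  /** First seed uniform at random; each further seed with probability proportional to its squared distance to the nearest seed. */
  static List<double[]> chooseSeeds(double[][] data, int k, Random rnd) {
    List<double[]> seeds = new ArrayList<>(k);
    seeds.add(data[rnd.nextInt(data.length)]);
    double[] weights = new double[data.length];
    while (seeds.size() < k) {
      double sum = 0;
      for (int i = 0; i < data.length; i++) {
        double best = Double.POSITIVE_INFINITY;
        for (double[] s : seeds) {
          best = Math.min(best, sqDist(data[i], s));
        }
        weights[i] = best;
        sum += best;
      }
      // Weighted sampling; the index is capped at the last position so
      // rounding in the running sum cannot walk past the array end.
      double r = rnd.nextDouble() * sum;
      int pos = 0;
      while (pos < weights.length - 1) {
        r -= weights[pos];
        if (r <= 0) {
          break;
        }
        pos++;
      }
      seeds.add(data[pos]);
    }
    return seeds;
  }
}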
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java index 9917337e..cc7aaa9e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -39,13 +38,9 @@ import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * Provides the k-medians clustering algorithm, using Lloyd-style bulk @@ -83,28 +78,23 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext super(distanceFunction, k, maxiter, initializer); } - /** - * Run k-medians. - * - * @param database Database - * @param relation relation to use - * @return result - */ + @Override public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + return new Clustering<>("k-Medians Clustering", "kmedians-clustering"); } // Choose initial medians - List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(relation, k, getDistanceFunction()); + List<? extends NumberVector<?>> medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null; for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) { - if (LOG.isVerbose()) { - LOG.verbose("K-Medians iteration " + (iteration + 1)); + if (prog != null) { + prog.incrementProcessed(LOG); } boolean changed = assignToNearestCluster(relation, medians, clusters); // Stop if no cluster assignment changed. 
@@ -114,12 +104,15 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext // Recompute medians. medians = medians(clusters, medians, relation); } + if (prog != null) { + prog.setCompleted(LOG); + } // Wrap result final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); - Clustering<MeanModel<V>> result = new Clustering<MeanModel<V>>("k-Medians Clustering", "kmedians-clustering"); + Clustering<MeanModel<V>> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering"); for (int i = 0; i < clusters.size(); i++) { - MeanModel<V> model = new MeanModel<V>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); - result.addCluster(new Cluster<MeanModel<V>>(clusters.get(i), model)); + MeanModel<V> model = new MeanModel<>(factory.newNumberVector(medians.get(i).getColumnVector().getArrayRef())); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -136,46 +129,15 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<NumberVector<?>, D> { - /** - * k Parameter. - */ - protected int k; - - /** - * Maximum number of iterations. - */ - protected int maxiter; - - /** - * Initialization method. - */ - protected KMeansInitialization<V> initializer; - + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> { @Override - protected void makeOptions(Parameterization config) { - super.makeOptions(config); - IntParameter kP = new IntParameter(K_ID); - kP.addConstraint(new GreaterConstraint(0)); - if (config.grab(kP)) { - k = kP.intValue(); - } - - ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<KMeansInitialization<V>>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class); - if (config.grab(initialP)) { - initializer = initialP.instantiateClass(config); - } - - IntParameter maxiterP = new IntParameter(MAXITER_ID, 0); - maxiterP.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(maxiterP)) { - maxiter = maxiterP.intValue(); - } + protected Logging getLogger() { + return LOG; } @Override protected KMediansLloyd<V, D> makeInstance() { - return new KMediansLloyd<V, D>(distanceFunction, k, maxiter, initializer); + return new KMediansLloyd<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java index f4398458..87a0c7ae 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -46,6 +46,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import 
de.lmu.ifi.dbs.elki.math.Mean; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; @@ -119,13 +120,13 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); // Choose initial medoids ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -135,9 +136,13 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista // TODO: reuse this information, from the build phase, when possible? assignToNearestCluster(medoids, mdists, clusters, distQ); + IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null; // Swap phase boolean changed = true; while (changed) { + if (prog != null) { + prog.incrementProcessed(LOG); + } changed = false; // Try to swap the medoid with a better cluster member: int i = 0; @@ -168,12 +173,15 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista assignToNearestCluster(medoids, mdists, clusters, distQ); } } + if (prog != null) { + prog.setCompleted(LOG); + } // Wrap result - Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); for (int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); - result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -256,7 +264,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista k = kP.intValue(); } - ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } @@ -270,7 +278,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista @Override protected KMedoidsEM<V, D> makeInstance() { - return new KMedoidsEM<V, D>(distanceFunction, k, maxiter, initializer); + return new KMedoidsEM<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java index 269e7e9e..136a4129 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsInitialization.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is 
part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index 906501e4..1feda867 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -50,6 +50,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; @@ -124,14 +125,14 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist */ public Clustering<MedoidModel> run(Database database, Relation<V> relation) { if (relation.size() <= 0) { - return new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); DBIDs ids = relation.getDBIDs(); // Choose initial medoids ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); // Setup cluster assignment store - List<ModifiableDBIDs> clusters = new ArrayList<ModifiableDBIDs>(); + List<ModifiableDBIDs> clusters = new ArrayList<>(); for (int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } @@ -141,9 +142,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist // TODO: reuse this information, from the build phase, when possible? assignToNearestCluster(medoids, ids, second, clusters, distQ); + IndefiniteProgress prog = LOG.isVerbose() ? 
new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase boolean changed = true; while (changed) { + if (prog != null) { + prog.incrementProcessed(LOG); + } changed = false; // Try to swap the medoid with a better cluster member: double best = 0; @@ -189,6 +194,9 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } } } + if (prog != null) { + prog.setCompleted(LOG); + } if (LOG.isDebugging()) { LOG.debug("Best cost: " + best); } @@ -204,10 +212,10 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } // Wrap result - Clustering<MedoidModel> result = new Clustering<MedoidModel>("k-Medoids Clustering", "kmedoids-clustering"); + Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); for (int i = 0; i < clusters.size(); i++) { MedoidModel model = new MedoidModel(medoids.get(i)); - result.addCluster(new Cluster<MedoidModel>(clusters.get(i), model)); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); } return result; } @@ -293,7 +301,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist k = kP.intValue(); } - ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<KMedoidsInitialization<V>>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); + ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); if (config.grab(initialP)) { initializer = initialP.instantiateClass(config); } @@ -307,7 +315,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist @Override protected KMedoidsPAM<V, D> makeInstance() { - return new KMedoidsPAM<V, D>(distanceFunction, k, maxiter, initializer); + return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java index 1fc7160e..c7e1751f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/PAMInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,6 +25,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; @@ -69,16 +71,16 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? 
super NumberVector<?>, ?> distanceFunction) { // Get a distance query if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) { throw new AbortException("PAM initialization can only be used with numerical distances."); } @SuppressWarnings("unchecked") final PrimitiveDistanceFunction<? super V, D> distF = (PrimitiveDistanceFunction<? super V, D>) distanceFunction; - final DistanceQuery<V, D> distQ = relation.getDatabase().getDistanceQuery(relation, distF); + final DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, distF); DBIDs medids = chooseInitialMedoids(k, distQ); - List<V> medoids = new ArrayList<V>(k); + List<V> medoids = new ArrayList<>(k); for(DBIDIter iter = medids.iter(); iter.valid(); iter.advance()) { medoids.add(relation.get(iter)); } @@ -179,7 +181,7 @@ public class PAMInitialMeans<V, D extends NumberDistance<D, ?>> implements KMean public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractParameterizer { @Override protected PAMInitialMeans<V, D> makeInstance() { - return new PAMInitialMeans<V, D>(); + return new PAMInitialMeans<>(); } } }
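Editor's note: the PAM-family classes touched above (KMedoidsEM, KMedoidsPAM, PAMInitialMeans) share one outline: choose initial medoids, assign every object to its nearest medoid, then keep swapping a medoid with a non-medoid while the total cost improves. Below is a rough, self-contained illustration of such a swap loop over a precomputed distance matrix with plain int indices. It is a simplified "accept any improving swap" variant that recomputes the full cost each time, not the incremental bookkeeping used in the actual classes.

/** Greedy PAM-style swap phase sketch (illustrative only, not the ELKI implementation). */
public class PamSwapSketch {
  /** Total cost: each point contributes its distance to the nearest medoid. */
  static double cost(double[][] dist, int[] medoids) {
    double total = 0;
    for (int i = 0; i < dist.length; i++) {
      double best = Double.POSITIVE_INFINITY;
      for (int m : medoids) {
        best = Math.min(best, dist[i][m]);
      }
      total += best;
    }
    return total;
  }

  /** Repeat sweeps while some (medoid, non-medoid) exchange lowers the cost. */
  static void swapPhase(double[][] dist, int[] medoids) {
    boolean changed = true;
    while (changed) {
      changed = false;
      double current = cost(dist, medoids);
      for (int mi = 0; mi < medoids.length; mi++) {
        int original = medoids[mi];
        for (int candidate = 0; candidate < dist.length; candidate++) {
          medoids[mi] = candidate;       // tentatively swap
          double swapped = cost(dist, medoids);
          if (swapped < current) {       // keep the improvement
            current = swapped;
            original = candidate;
            changed = true;
          } else {
            medoids[mi] = original;      // undo the tentative swap
          }
        }
        medoids[mi] = original;
      }
    }
  }
}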
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java index 78e59be7..214f4ce6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyChosenInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -25,6 +25,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; import java.util.ArrayList; import java.util.List; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; @@ -52,9 +54,9 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd); - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { means.add(relation.get(iter)); } @@ -74,10 +76,9 @@ public class RandomlyChosenInitialMeans<V> extends AbstractKMeansInitialization< * @apiviz.exclude */ public static class Parameterizer<V> extends AbstractKMeansInitialization.Parameterizer<V> { - @Override protected RandomlyChosenInitialMeans<V> makeInstance() { - return new RandomlyChosenInitialMeans<V>(rnd); + return new RandomlyChosenInitialMeans<>(rnd); } } }
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java index 300f5cb0..ee90e0dc 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,6 +27,7 @@ import java.util.List; import java.util.Random; import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; @@ -54,11 +55,11 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab } @Override - public List<V> chooseInitialMeans(Relation<V> relation, int k, PrimitiveDistanceFunction<? super V, ?> distanceFunction) { + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { final int dim = RelationUtil.dimensionality(relation); NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation); - List<V> means = new ArrayList<V>(k); + List<V> means = new ArrayList<>(k); final Random random = rnd.getRandom(); for(int i = 0; i < k; i++) { double[] r = MathUtil.randomDoubleArray(dim, random); @@ -81,7 +82,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab public static class Parameterizer<V extends NumberVector<?>> extends AbstractKMeansInitialization.Parameterizer<V> { @Override protected RandomlyGeneratedInitialMeans<V> makeInstance() { - return new RandomlyGeneratedInitialMeans<V>(rnd); + return new RandomlyGeneratedInitialMeans<>(rnd); } } }
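Editor's note: RandomlyGeneratedInitialMeans above draws each initial mean from the per-dimension [min, max] range of the relation (computeMinMax plus a uniform random array). The same idea on raw double[][] data, as a small sketch with made-up names rather than the ELKI Relation/factory machinery:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

/** Sketch: k random vectors drawn uniformly inside the data's bounding box. */
public class RandomBoxInitialMeansSketch {
  static List<double[]> chooseInitialMeans(double[][] data, int k, Random rnd) {
    int dim = data[0].length;
    double[] min = new double[dim];
    double[] max = new double[dim];
    Arrays.fill(min, Double.POSITIVE_INFINITY);
    Arrays.fill(max, Double.NEGATIVE_INFINITY);
    for (double[] v : data) {            // per-dimension min/max of the data
      for (int d = 0; d < dim; d++) {
        min[d] = Math.min(min[d], v[d]);
        max[d] = Math.max(max[d], v[d]);
      }
    }
    List<double[]> means = new ArrayList<>(k);
    for (int i = 0; i < k; i++) {
      double[] m = new double[dim];
      for (int d = 0; d < dim; d++) {    // scale a uniform [0,1) draw into [min, max)
        m[d] = min[d] + rnd.nextDouble() * (max[d] - min[d]);
      }
      means.add(m);
    }
    return means;
  }
}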
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java new file mode 100644 index 00000000..9f0a1923 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java @@ -0,0 +1,160 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ProxyDatabase; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.ProxyView; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.utilities.RandomFactory; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Initialize k-means by running k-means on a sample of the data set only. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization<V> { + /** + * Variant of kMeans for the bisecting step. + */ + private KMeans<V, D, ?> innerkMeans; + + /** + * Sample size. + */ + private double rate; + + /** + * Constructor. + * + * @param rnd Random generator. + * @param innerkMeans Inner k-means algorithm. + * @param rate Sampling rate. 
+ */ + public SampleKMeansInitialization(RandomFactory rnd, KMeans<V, D, ?> innerkMeans, double rate) { + super(rnd); + this.innerkMeans = innerkMeans; + this.rate = rate; + } + + @Override + public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) { + final int samplesize = (int) Math.ceil(rate * relation.size()); + final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), samplesize, rnd); + + ProxyView<V> proxyv = new ProxyView<>(database, sample, relation); + ProxyDatabase proxydb = new ProxyDatabase(sample, proxyv); + + innerkMeans.setK(k); + @SuppressWarnings("unchecked") + PrimitiveDistanceFunction<? super NumberVector<?>, D> df = (PrimitiveDistanceFunction<? super NumberVector<?>, D>) distanceFunction; + innerkMeans.setDistanceFunction(df); + Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv); + List<V> means = new ArrayList<>(); + for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) { + means.add((V) cluster.getModel().getMean()); + } + + return means; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + * @param <D> Distance type + */ + public static class Parameterizer<V extends NumberVector<?>, D extends Distance<?>> extends AbstractKMeansInitialization.Parameterizer<V> { + /** + * Parameter to specify the kMeans variant. + */ + public static final OptionID KMEANS_ID = new OptionID("kmeans.algorithm", "KMeans variant to run multiple times."); + + /** + * Parameter to specify the sampling rate. + */ + public static final OptionID SAMPLE_ID = new OptionID("kmeans.samplesize", "Sample set size (if > 1) or sampling rate (if < 1)."); + + /** + * Inner k-means algorithm to use. + */ + protected KMeans<V, D, ?> innerkMeans; + + /** + * Sampling rate. + */ + protected double rate; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<KMeans<V, D, ?>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class); + if (config.grab(kMeansVariantP)) { + ListParameterization kMeansVariantParameters = new ListParameterization(); + + // We will always invoke this with k as requested from outside!
+ kMeansVariantParameters.addParameter(KMeans.K_ID, 13); + kMeansVariantParameters.addParameter(KMeans.DISTANCE_FUNCTION_ID, SquaredEuclideanDistanceFunction.class); + + ChainedParameterization combinedConfig = new ChainedParameterization(kMeansVariantParameters, config); + combinedConfig.errorsTo(config); + innerkMeans = kMeansVariantP.instantiateClass(combinedConfig); + } + + DoubleParameter sampleP = new DoubleParameter(SAMPLE_ID); + if (config.grab(sampleP)) { + rate = sampleP.doubleValue(); + } + } + + @Override + protected SampleKMeansInitialization<V, D> makeInstance() { + return new SampleKMeansInitialization<>(rnd, innerkMeans, rate); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java index 2ce625b0..aa4c3e24 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java new file mode 100644 index 00000000..f2de7846 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/KMeansQualityMeasure.java @@ -0,0 +1,54 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; + +/** + * Interface for computing the quality of a K-Means clustering. + * + * @author Erich Schubert + * + * @param <O> Input Object restriction type + * @param <D> Distance restriction type + */ +public interface KMeansQualityMeasure<O extends NumberVector<?>, D extends Distance<?>> { + /** + * Calculates and returns the quality measure. + * + * @param clustering Clustering to analyze + * @param distanceFunction Distance function to use (usually Euclidean or + * squared Euclidean!) + * @param relation Relation for accessing objects + * @param <V> Actual vector type (could be a subtype of O!) 
+ * + * @return quality measure + */ + <V extends O> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends D> distanceFunction, Relation<V> relation); +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java new file mode 100644 index 00000000..e0ddfff0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterMeanDistanceQualityMeasure.java @@ -0,0 +1,89 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; + +/** + * Class for computing the average overall distance. + * + * The average of all average pairwise distances in a cluster. + * + * @author Stephan Baier + */ +public class WithinClusterMeanDistanceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> { + @Override + public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) { + @SuppressWarnings("unchecked") + final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters(); + + if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? 
super V>) distanceFunction; + double clusterDistanceSum = 0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + + // Compute sum of pairwise distances: + double clusterPairwiseDistanceSum = 0; + for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { + V obj1 = relation.get(iter1); + for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + clusterPairwiseDistanceSum += df.doubleDistance(obj1, relation.get(iter2)); + } + } + clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); + } + + return clusterDistanceSum / clusterList.size(); + } else { + double clusterDistanceSum = 0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + + // Compute sum of pairwise distances: + double clusterPairwiseDistanceSum = 0; + for (DBIDIter iter1 = ids.iter(); iter1.valid(); iter1.advance()) { + V obj1 = relation.get(iter1); + for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) { + clusterPairwiseDistanceSum += distanceFunction.distance(obj1, relation.get(iter2)).doubleValue(); + } + } + clusterDistanceSum += clusterPairwiseDistanceSum / (ids.size() * ids.size()); + } + + return clusterDistanceSum / clusterList.size(); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java new file mode 100644 index 00000000..32ad5210 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/WithinClusterVarianceQualityMeasure.java @@ -0,0 +1,83 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; + +/* + This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.MeanModel; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; + +/** + * Class for computing the variance in a clustering result (sum-of-squares). 
+ * + * @author Stephan Baier + */ +public class WithinClusterVarianceQualityMeasure implements KMeansQualityMeasure<NumberVector<?>, NumberDistance<?, ?>> { + @Override + public <V extends NumberVector<?>> double calculateCost(Clustering<? extends MeanModel<V>> clustering, PrimitiveDistanceFunction<? super V, ? extends NumberDistance<?, ?>> distanceFunction, Relation<V> relation) { + @SuppressWarnings("unchecked") + final List<Cluster<MeanModel<V>>> clusterList = (List<Cluster<MeanModel<V>>>) (List<?>) clustering.getAllClusters(); + + boolean squared = (distanceFunction instanceof SquaredEuclideanDistanceFunction); + if (distanceFunction instanceof PrimitiveDoubleDistanceFunction) { + @SuppressWarnings("unchecked") + PrimitiveDoubleDistanceFunction<? super V> df = (PrimitiveDoubleDistanceFunction<? super V>) distanceFunction; + double variance = 0.0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + V mean = cluster.getModel().getMean(); + + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + double dist = df.doubleDistance(relation.get(iter), mean); + if (squared) { + variance += dist; + } else { + variance += dist * dist; + } + } + } + return variance; + } else { + double variance = 0.0; + for (Cluster<MeanModel<V>> cluster : clusterList) { + DBIDs ids = cluster.getIDs(); + V mean = cluster.getModel().getMean(); + + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + double dist = distanceFunction.distance(relation.get(iter), mean).doubleValue(); + variance += dist * dist; + } + } + return variance; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java new file mode 100644 index 00000000..ed9a528d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java @@ -0,0 +1,4 @@ +/** + * Quality measures for k-Means results. + */ +package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality;
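Editor's note: the two new quality measures differ only in what they sum per cluster. WithinClusterVarianceQualityMeasure adds the squared distance of every member to the cluster mean; WithinClusterMeanDistanceQualityMeasure averages all pairwise distances inside each cluster (including self-pairs, hence the division by n*n) and then averages over clusters. A compact plain-Java restatement of both formulas, assuming Euclidean distance and made-up helper names (not the ELKI interfaces):

import java.util.List;

/** Sketch of the two k-means quality criteria on raw double[] clusters. */
public class KMeansQualitySketch {
  static double dist(double[] a, double[] b) {
    double s = 0;
    for (int d = 0; d < a.length; d++) {
      double diff = a[d] - b[d];
      s += diff * diff;
    }
    return Math.sqrt(s);
  }

  /** Sum of squared distances of each point to its cluster mean. */
  static double withinClusterVariance(List<double[][]> clusters, List<double[]> means) {
    double variance = 0;
    for (int c = 0; c < clusters.size(); c++) {
      for (double[] p : clusters.get(c)) {
        double d = dist(p, means.get(c));
        variance += d * d;
      }
    }
    return variance;
  }

  /** Average over clusters of the average pairwise distance within each cluster. */
  static double withinClusterMeanDistance(List<double[][]> clusters) {
    double sum = 0;
    for (double[][] cluster : clusters) {
      double pairwise = 0;
      for (double[] a : cluster) {
        for (double[] b : cluster) {
          pairwise += dist(a, b);
        }
      }
      sum += pairwise / (cluster.length * (double) cluster.length);
    }
    return sum / clusters.size();
  }
}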
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java index 4ba1ce09..26fb3024 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/package-info.java @@ -19,7 +19,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java index 37b3eb57..db026e93 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -172,7 +172,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster if(LOG.isVerbose()) { LOG.verbose("*** 1. Identification of subspaces that contain clusters ***"); } - SortedMap<Integer, List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new TreeMap<Integer, List<CLIQUESubspace<V>>>(); + SortedMap<Integer, List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new TreeMap<>(); List<CLIQUESubspace<V>> denseSubspaces = findOneDimensionalDenseSubspaces(relation); dimensionToDenseSubspaces.put(Integer.valueOf(0), denseSubspaces); if(LOG.isVerbose()) { @@ -204,7 +204,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } // build result int numClusters = 1; - Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("CLIQUE clustering", "clique-clustering"); + Clustering<SubspaceModel<V>> result = new Clustering<>("CLIQUE clustering", "clique-clustering"); for(Integer dim : dimensionToDenseSubspaces.keySet()) { List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim); List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces); @@ -214,10 +214,10 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } for(Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) { - Cluster<SubspaceModel<V>> newCluster = new Cluster<SubspaceModel<V>>(modelAndCluster.second); - newCluster.setModel(new SubspaceModel<V>(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).toVector(relation))); + Cluster<SubspaceModel<V>> newCluster = new Cluster<>(modelAndCluster.second); + newCluster.setModel(new SubspaceModel<>(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).toVector(relation))); newCluster.setName("cluster_" + numClusters++); - result.addCluster(newCluster); + result.addToplevelCluster(newCluster); } } @@ -233,7 +233,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * cluster models */ private List<Pair<Subspace, ModifiableDBIDs>> determineClusters(List<CLIQUESubspace<V>> denseSubspaces) { - List<Pair<Subspace, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace, ModifiableDBIDs>>(); + List<Pair<Subspace, ModifiableDBIDs>> clusters = 
new ArrayList<>(); for(CLIQUESubspace<V> subspace : denseSubspaces) { List<Pair<Subspace, ModifiableDBIDs>> clustersInSubspace = subspace.determineClusters(); @@ -339,7 +339,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } // build the 1 dimensional units - List<CLIQUEUnit<V>> units = new ArrayList<CLIQUEUnit<V>>((xsi * dimensionality)); + List<CLIQUEUnit<V>> units = new ArrayList<>((xsi * dimensionality)); for(int x = 0; x < xsi; x++) { for(int d = 0; d < dimensionality; d++) { units.add(new CLIQUEUnit<V>(new Interval(d, unit_bounds[x][d], unit_bounds[x + 1][d]))); @@ -396,8 +396,8 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } } - Collection<CLIQUEUnit<V>> denseUnits = new ArrayList<CLIQUEUnit<V>>(); - Map<Integer, CLIQUESubspace<V>> denseSubspaces = new HashMap<Integer, CLIQUESubspace<V>>(); + Collection<CLIQUEUnit<V>> denseUnits = new ArrayList<>(); + Map<Integer, CLIQUESubspace<V>> denseSubspaces = new HashMap<>(); for(CLIQUEUnit<V> unit : units) { // unit is a dense unit if(unit.selectivity(total) >= tau) { @@ -406,7 +406,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster int dim = unit.getIntervals().iterator().next().getDimension(); CLIQUESubspace<V> subspace_d = denseSubspaces.get(Integer.valueOf(dim)); if(subspace_d == null) { - subspace_d = new CLIQUESubspace<V>(dim); + subspace_d = new CLIQUESubspace<>(dim); denseSubspaces.put(Integer.valueOf(dim), subspace_d); } subspace_d.addDenseUnit(unit); @@ -420,7 +420,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster LOG.debugFine(msg.toString()); } - List<CLIQUESubspace<V>> subspaceCandidates = new ArrayList<CLIQUESubspace<V>>(denseSubspaces.values()); + List<CLIQUESubspace<V>> subspaceCandidates = new ArrayList<>(denseSubspaces.values()); Collections.sort(subspaceCandidates, new CLIQUESubspace.CoverageComparator()); return subspaceCandidates; } @@ -436,12 +436,12 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster */ private List<CLIQUESubspace<V>> findDenseSubspaceCandidates(Relation<V> database, List<CLIQUESubspace<V>> denseSubspaces) { // sort (k-1)-dimensional dense subspace according to their dimensions - List<CLIQUESubspace<V>> denseSubspacesByDimensions = new ArrayList<CLIQUESubspace<V>>(denseSubspaces); + List<CLIQUESubspace<V>> denseSubspacesByDimensions = new ArrayList<>(denseSubspaces); Collections.sort(denseSubspacesByDimensions, new Subspace.DimensionComparator()); // determine k-dimensional dense subspace candidates double all = database.size(); - List<CLIQUESubspace<V>> denseSubspaceCandidates = new ArrayList<CLIQUESubspace<V>>(); + List<CLIQUESubspace<V>> denseSubspaceCandidates = new ArrayList<>(); while(!denseSubspacesByDimensions.isEmpty()) { CLIQUESubspace<V> s1 = denseSubspacesByDimensions.remove(0); @@ -614,7 +614,7 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster @Override protected CLIQUE<V> makeInstance() { - return new CLIQUE<V>(xsi, tau, prune); + return new CLIQUE<>(xsi, tau, prune); } } }
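Editor's note: in the CLIQUE changes above, the one-dimensional units are the xsi equal-width intervals per dimension, and a unit is kept as dense when its selectivity (the fraction of all points it covers) reaches tau. A rough sketch of that first step on plain arrays; it only counts points per 1-d grid cell and omits ELKI's CLIQUEUnit/CLIQUESubspace bookkeeping entirely.

/** Sketch: count points per 1-d grid cell and test the density threshold (illustrative only). */
public class CliqueDenseUnitsSketch {
  /** counts[d][x] = number of points whose value in dimension d falls into grid cell x (of xsi cells). */
  static int[][] countOneDimensionalUnits(double[][] data, int xsi, double[] min, double[] max) {
    int dim = min.length;
    int[][] counts = new int[dim][xsi];
    for (double[] v : data) {
      for (int d = 0; d < dim; d++) {
        double extent = max[d] - min[d];
        int x = extent > 0 ? (int) ((v[d] - min[d]) / extent * xsi) : 0;
        if (x >= xsi) {        // the maximum value belongs to the last interval
          x = xsi - 1;
        }
        counts[d][x]++;
      }
    }
    return counts;
  }

  /** A unit is dense when its selectivity (fraction of all points it covers) reaches tau. */
  static boolean isDense(int count, int total, double tau) {
    return count / (double) total >= tau;
  }
}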
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java index a3496a0e..b17ebebb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -62,7 +62,8 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; import de.lmu.ifi.dbs.elki.utilities.FormatUtil; -import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyReferenceLists; +import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy; +import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; @@ -238,29 +239,29 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } // build the hierarchy - buildHierarchy(database, distFunc, clusters, dimensionality); + Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering"); + buildHierarchy(database, distFunc, clustering, clusters, dimensionality); if (LOG.isVerbose()) { StringBuilder msg = new StringBuilder("Step 4: build hierarchy"); for (Cluster<SubspaceModel<V>> c : clusters) { msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size()); - for (Cluster<SubspaceModel<V>> cluster : c.getParents()) { - msg.append("\n parent ").append(cluster); + for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) { + msg.append("\n parent ").append(iter.get()); } - for (Cluster<SubspaceModel<V>> cluster : c.getChildren()) { - msg.append("\n child ").append(cluster); + for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) { + msg.append("\n child ").append(iter.get()); } } LOG.verbose(msg.toString()); } // build result - Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("DiSH clustering", "dish-clustering"); for (Cluster<SubspaceModel<V>> c : clusters) { - if (c.getParents() == null || c.getParents().isEmpty()) { - result.addCluster(c); + if (clustering.getClusterHierarchy().numParents(c) == 0) { + clustering.addToplevelCluster(c); } } - return result; + return clustering; } /** @@ -274,9 +275,9 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin private Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> extractClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder) { FiniteProgress progress = LOG.isVerbose() ? 
new FiniteProgress("Extract Clusters", database.size(), LOG) : null; int processed = 0; - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>>(); - Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>>(); - Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<DBID, Pair<BitSet, ArrayModifiableDBIDs>>(); + Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>(); + Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>(); + Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>(); for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) { ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next(); entryMap.put(entry.getID(), entry); @@ -287,7 +288,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // get the list of (parallel) clusters for the preference vector List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector); if (parallelClusters == null) { - parallelClusters = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(); + parallelClusters = new ArrayList<>(); clustersMap.put(preferenceVector, parallelClusters); } @@ -305,7 +306,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } if (cluster == null) { - cluster = new Pair<BitSet, ArrayModifiableDBIDs>(preferenceVector, DBIDUtil.newArray()); + cluster = new Pair<>(preferenceVector, DBIDUtil.newArray()); parallelClusters.add(cluster); } cluster.second.add(entry.getID()); @@ -373,15 +374,13 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin private List<Cluster<SubspaceModel<V>>> sortClusters(Relation<V> database, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) { final int db_dim = RelationUtil.dimensionality(database); // int num = 1; - List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<Cluster<SubspaceModel<V>>>(); + List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>(); for (BitSet pv : clustersMap.keySet()) { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); for (int i = 0; i < parallelClusters.size(); i++) { Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i); - Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.second); - cluster.setModel(new SubspaceModel<V>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); - cluster.setHierarchy(new HierarchyReferenceLists<Cluster<SubspaceModel<V>>>(cluster, new ArrayList<Cluster<SubspaceModel<V>>>(), new ArrayList<Cluster<SubspaceModel<V>>>())); - // cluster.setName("Cluster_" + num++); + Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second); + cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database))); String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, ""); if (parallelClusters.size() > 1) { cluster.setName("Cluster_" + subspace + "_" + i); @@ -415,9 +414,9 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin private void checkClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Map<BitSet, List<Pair<BitSet, 
ArrayModifiableDBIDs>>> clustersMap, int minpts) { // check if there are clusters < minpts // and add them to not assigned - List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(); - Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>>(); - Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<BitSet, ArrayModifiableDBIDs>(new BitSet(), DBIDUtil.newArray()); + List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>(); + Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>(); + Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray()); for (BitSet pv : clustersMap.keySet()) { // noise if (pv.cardinality() == 0) { @@ -429,7 +428,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // clusters else { List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv); - List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(parallelClusters.size()); + List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size()); for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) { if (!pv.equals(new BitSet()) && c.second.size() < minpts) { notAssigned.add(c); @@ -456,7 +455,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin } } - List<Pair<BitSet, ArrayModifiableDBIDs>> noiseList = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(1); + List<Pair<BitSet, ArrayModifiableDBIDs>> noiseList = new ArrayList<>(1); noiseList.add(noise); clustersMap.put(noise.first, noiseList); } @@ -510,13 +509,15 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * Builds the cluster hierarchy. 
* * @param distFunc the distance function + * @param clustering Clustering we process * @param clusters the sorted list of clusters * @param dimensionality the dimensionality of the data * @param database the database containing the data objects */ - private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) { + private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Clustering<SubspaceModel<V>> clustering, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) { StringBuilder msg = new StringBuilder(); final int db_dim = RelationUtil.dimensionality(database); + Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy(); for (int i = 0; i < clusters.size() - 1; i++) { Cluster<SubspaceModel<V>> c_i = clusters.get(i); @@ -536,9 +537,8 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin // noise level reached if (c_j.getModel().getSubspace().dimensionality() == 0) { // no parents exists -> parent is noise - if (c_i.getParents().isEmpty()) { - c_j.getChildren().add(c_i); - c_i.getParents().add(c_j); + if (hier.numParents(c_i) == 0) { + clustering.addChildCluster(c_j, c_i); if (LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())); @@ -560,9 +560,8 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin if (d <= 2 * epsilon) { // no parent exists or c_j is not a parent of the already // existing parents - if (c_i.getParents().isEmpty() || !isParent(database, distFunc, c_j, c_i.getParents())) { - c_j.getChildren().add(c_i); - c_i.getParents().add(c_j); + if (hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) { + clustering.addChildCluster(c_j, c_i); if (LOG.isDebugging()) { msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())); msg.append("] is parent of ["); @@ -591,16 +590,17 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin * @param distFunc the distance function for distance computation between the * clusters * @param parent the parent to be tested - * @param children the list of children to be tested + * @param iter the list of children to be tested * @return true, if the specified parent cluster is a parent of one child of * the children clusters, false otherwise */ - private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, List<Cluster<SubspaceModel<V>>> children) { + private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, Iter<Cluster<SubspaceModel<V>>> iter) { V parent_centroid = ProjectedCentroid.make(parent.getModel().getDimensions(), database, parent.getIDs()).toVector(database); int dimensionality = RelationUtil.dimensionality(database); int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality(); - for (Cluster<SubspaceModel<V>> child : children) { + for (; iter.valid(); iter.advance()) { + Cluster<SubspaceModel<V>> child = iter.get(); V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database); PreferenceVectorBasedCorrelationDistance distance = 
distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions()); if (distance.getCorrelationValue() == subspaceDim_parent) { @@ -699,7 +699,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin @Override protected DiSH<V> makeInstance() { - return new DiSH<V>(epsilon, dishDistance, opticsO); + return new DiSH<>(epsilon, dishDistance, opticsO); } } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java index 58f3acef..9ac7c072 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java @@ -3,7 +3,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -119,7 +119,7 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB @Override
protected HiSC<V> makeInstance() {
- return new HiSC<V>(distanceFunction);
+ return new HiSC<>(distanceFunction);
}
}
}
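Editor's note: the one-line change in this HiSC hunk (new HiSC<V>(...) to new HiSC<>(...)) is the same mechanical rewrite that recurs in nearly every file of this patch, for HashMap, ArrayList, Pair, Cluster and the rest: the Java 7 diamond operator, which lets the compiler infer the constructor's type arguments from the declared target type. The change is purely syntactic. A minimal self-contained illustration (class and variable names are hypothetical, not part of ELKI):

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class DiamondDemo {
      public static void main(String[] args) {
        // Pre-Java-7 style: type arguments spelled out on both sides.
        Map<String, List<Integer>> before = new HashMap<String, List<Integer>>();
        // Java 7 diamond: <String, List<Integer>> is inferred from the target type.
        Map<String, List<Integer>> after = new HashMap<>();
        List<Integer> inner = new ArrayList<>(); // inference works for nested generics, too
        after.put("key", inner);
        System.out.println(before.getClass() == after.getClass()); // true: same erased class
      }
    }

Both forms compile to identical bytecode; the diamond form merely avoids restating the type arguments, which is why the sweep can be applied patch-wide without behavioral risk.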
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java index ef49ff10..92158734 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -49,13 +49,13 @@ import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResult; import de.lmu.ifi.dbs.elki.distance.distanceresultlist.DistanceDBIDResultUtil; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -150,13 +150,13 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc); final Random random = rnd.getRandom(); - if(RelationUtil.dimensionality(relation) < l) { + if (RelationUtil.dimensionality(relation) < l) { throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")"); } // TODO: use a StepProgress! // initialization phase - if(LOG.isVerbose()) { + if (LOG.isVerbose()) { LOG.verbose("1. Initialization phase..."); } int sampleSize = Math.min(relation.size(), k_i * k); @@ -165,7 +165,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster int medoidSize = Math.min(relation.size(), m_i * k); DBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random); - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("sampleSize ").append(sampleSize).append('\n'); @@ -176,7 +176,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } // iterative phase - if(LOG.isVerbose()) { + if (LOG.isVerbose()) { LOG.verbose("2. 
Iterative phase..."); } double bestObjective = Double.POSITIVE_INFINITY; @@ -184,7 +184,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster ModifiableDBIDs m_bad = null; ModifiableDBIDs m_current = initialSet(medoids, k, random); - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("m_c ").append(m_current).append('\n'); @@ -196,12 +196,12 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster // TODO: Use DataStore and Trove for performance Map<DBID, PROCLUSCluster> clusters = null; int loops = 0; - while(loops < 10) { + while (loops < 10) { Map<DBID, TIntSet> dimensions = findDimensions(m_current, relation, distFunc, rangeQuery); clusters = assignPoints(dimensions, relation); double objectiveFunction = evaluateClusters(clusters, dimensions, relation); - if(objectiveFunction < bestObjective) { + if (objectiveFunction < bestObjective) { // restart counting loops loops = 0; bestObjective = objectiveFunction; @@ -211,32 +211,32 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster m_current = computeM_current(medoids, m_best, m_bad, random); loops++; - if(cprogress != null) { + if (cprogress != null) { cprogress.setProcessed(clusters.size(), LOG); } } - if(cprogress != null) { + if (cprogress != null) { cprogress.setCompleted(LOG); } // refinement phase - if(LOG.isVerbose()) { + if (LOG.isVerbose()) { LOG.verbose("3. Refinement phase..."); } - List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<PROCLUSCluster>(clusters.values()), relation); + List<Pair<V, TIntSet>> dimensions = findDimensions(new ArrayList<>(clusters.values()), relation); List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation); // build result int numClusters = 1; - Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("ProClus clustering", "proclus-clustering"); - for(PROCLUSCluster c : finalClusters) { - Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.objectIDs); - cluster.setModel(new SubspaceModel<V>(new Subspace(c.getDimensions()), c.centroid)); + Clustering<SubspaceModel<V>> result = new Clustering<>("ProClus clustering", "proclus-clustering"); + for (PROCLUSCluster c : finalClusters) { + Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.objectIDs); + cluster.setModel(new SubspaceModel<>(new Subspace(c.getDimensions()), c.centroid)); cluster.setName("cluster_" + numClusters++); - result.addCluster(cluster); + result.addToplevelCluster(cluster); } return result; } @@ -257,22 +257,22 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster // m_1 is random point of S DBID m_i = s.remove(random.nextInt(s.size())); medoids.add(m_i); - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { LOG.debugFiner("medoids " + medoids); } // compute distances between each point in S and m_i // FIXME: don't use maps, so we can work with DBIDRef - Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<DBID, DistanceDBIDPair<DoubleDistance>>(); - for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { + Map<DBID, DistanceDBIDPair<DoubleDistance>> distances = new HashMap<>(); + for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { DBID id = DBIDUtil.deref(iter); DoubleDistance dist = distFunc.distance(id, m_i); distances.put(id, DBIDUtil.newDistancePair(dist, id)); } - for(int i = 1; i < m; i++) { - // choose medoid m_i to be far from prevois medoids - 
List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<DistanceDBIDPair<DoubleDistance>>(distances.values()); + for (int i = 1; i < m; i++) { + // choose medoid m_i to be far from previous medoids + List<DistanceDBIDPair<DoubleDistance>> d = new ArrayList<>(distances.values()); DistanceDBIDResultUtil.sortByDistance(d); m_i = DBIDUtil.deref(d.get(d.size() - 1)); @@ -281,7 +281,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster distances.remove(m_i); // compute distances of each point to closest medoid - for(DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { + for (DBIDIter iter = s.iter(); iter.valid(); iter.advance()) { DBID id = DBIDUtil.deref(iter); DoubleDistance dist_new = distFunc.distance(id, m_i); DoubleDistance dist_old = distances.get(id).getDistance(); @@ -290,7 +290,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster distances.put(id, DBIDUtil.newDistancePair(dist, id)); } - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { LOG.debugFiner("medoids " + medoids); } } @@ -309,7 +309,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster private ModifiableDBIDs initialSet(DBIDs sampleSet, int k, Random random) { ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet); ModifiableDBIDs initialSet = DBIDUtil.newHashSet(); - while(initialSet.size() < k) { + while (initialSet.size() < k) { DBID next = s.remove(random.nextInt(s.size())); initialSet.add(next); } @@ -330,16 +330,15 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster m_list.removeDBIDs(m_best); ModifiableDBIDs m_current = DBIDUtil.newHashSet(); - for(DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { + for (DBIDIter iter = m_best.iter(); iter.valid(); iter.advance()) { DBID m_i = DBIDUtil.deref(iter); - if(m_bad.contains(m_i)) { + if (m_bad.contains(m_i)) { int currentSize = m_current.size(); - while(m_current.size() == currentSize) { + while (m_current.size() == currentSize) { DBID next = m_list.remove(random.nextInt(m_list.size())); m_current.add(next); } - } - else { + } else { m_current.add(m_i); } } @@ -358,28 +357,28 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @param distFunc the distance function * @return a mapping of the medoid's id to its locality */ - private Map<DBID, DistanceDBIDResult<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { - Map<DBID, DistanceDBIDResult<DoubleDistance>> result = new HashMap<DBID, DistanceDBIDResult<DoubleDistance>>(); + private Map<DBID, DistanceDBIDList<DoubleDistance>> getLocalities(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { + Map<DBID, DistanceDBIDList<DoubleDistance>> result = new HashMap<>(); - for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { DBID m = DBIDUtil.deref(iter); // determine minimum distance between current medoid m and any other // medoid m_i DoubleDistance minDist = null; - for(DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { + for (DBIDIter iter2 = medoids.iter(); iter2.valid(); iter2.advance()) { DBID m_i = DBIDUtil.deref(iter2); - if(DBIDUtil.equal(m_i, m)) { + if (DBIDUtil.equal(m_i, m)) { continue; } DoubleDistance currentDist = distFunc.distance(m, m_i); - if(minDist == null || 
currentDist.compareTo(minDist) < 0) { + if (minDist == null || currentDist.compareTo(minDist) < 0) { minDist = currentDist; } } // determine points in sphere centered at m with radius minDist assert minDist != null; - DistanceDBIDResult<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist); + DistanceDBIDList<DoubleDistance> qr = rangeQuery.getRangeForDBID(m, minDist); result.put(m, qr); } @@ -398,32 +397,32 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster */ private Map<DBID, TIntSet> findDimensions(DBIDs medoids, Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, RangeQuery<V, DoubleDistance> rangeQuery) { // get localities - Map<DBID, DistanceDBIDResult<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery); + Map<DBID, DistanceDBIDList<DoubleDistance>> localities = getLocalities(medoids, database, distFunc, rangeQuery); // compute x_ij = avg distance from points in l_i to medoid m_i int dim = RelationUtil.dimensionality(database); - Map<DBID, double[]> averageDistances = new HashMap<DBID, double[]>(); + Map<DBID, double[]> averageDistances = new HashMap<>(); - for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { DBID m_i = DBIDUtil.deref(iter); V medoid_i = database.get(m_i); - DistanceDBIDResult<DoubleDistance> l_i = localities.get(m_i); + DistanceDBIDList<DoubleDistance> l_i = localities.get(m_i); double[] x_i = new double[dim]; - for(DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { + for (DBIDIter qr = l_i.iter(); qr.valid(); qr.advance()) { V o = database.get(qr); - for(int d = 0; d < dim; d++) { + for (int d = 0; d < dim; d++) { x_i[d] += Math.abs(medoid_i.doubleValue(d) - o.doubleValue(d)); } } - for(int d = 0; d < dim; d++) { + for (int d = 0; d < dim; d++) { x_i[d] /= l_i.size(); } averageDistances.put(m_i, x_i); } - Map<DBID, TIntSet> dimensionMap = new HashMap<DBID, TIntSet>(); - List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<CTriple<Double, DBID, Integer>>(); - for(DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { + Map<DBID, TIntSet> dimensionMap = new HashMap<>(); + List<CTriple<Double, DBID, Integer>> z_ijs = new ArrayList<>(); + for (DBIDIter iter = medoids.iter(); iter.valid(); iter.advance()) { DBID m_i = DBIDUtil.deref(iter); TIntSet dims_i = new TIntHashSet(); dimensionMap.put(m_i, dims_i); @@ -431,33 +430,33 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster double[] x_i = averageDistances.get(m_i); // y_i double y_i = 0; - for(int j = 0; j < dim; j++) { + for (int j = 0; j < dim; j++) { y_i += x_i[j]; } y_i /= dim; // sigma_i double sigma_i = 0; - for(int j = 0; j < dim; j++) { + for (int j = 0; j < dim; j++) { double diff = x_i[j] - y_i; sigma_i += diff * diff; } sigma_i /= (dim - 1); sigma_i = Math.sqrt(sigma_i); - for(int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<Double, DBID, Integer>((x_i[j] - y_i) / sigma_i, m_i, j)); + for (int j = 0; j < dim; j++) { + z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, m_i, j)); } } Collections.sort(z_ijs); int max = Math.max(k * l, 2); - for(int m = 0; m < max; m++) { + for (int m = 0; m < max; m++) { CTriple<Double, DBID, Integer> z_ij = z_ijs.get(m); TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); dims_i.add(z_ij.getThird()); - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("z_ij ").append(z_ij).append('\n'); 
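Editor's note: both findDimensions variants in the PROCLUS hunks around here standardize the per-dimension average distances before picking cluster dimensions: for each medoid or cluster i, x_i[j] is the mean absolute deviation along dimension j, y_i is its mean over all dimensions, sigma_i the sample standard deviation, and z_ij = (x_i[j] - y_i) / sigma_i; the k*l globally smallest z-scores then select the cluster dimensions. A compact sketch of just the z-score step on a plain double[] (hypothetical helper, mirroring the loops shown above, not the ELKI code itself):

    public class ZScoreDemo {
      // Standardize one medoid's per-dimension average distances, as in PROCLUS findDimensions.
      static double[] zscores(double[] x) {
        int dim = x.length;
        double y = 0;
        for (double v : x) {
          y += v;
        }
        y /= dim;
        double sigma = 0;
        for (double v : x) {
          double diff = v - y;
          sigma += diff * diff;
        }
        sigma = Math.sqrt(sigma / (dim - 1)); // sample standard deviation, as above
        double[] z = new double[dim];
        for (int j = 0; j < dim; j++) {
          z[j] = (x[j] - y) / sigma;
        }
        return z;
      }

      public static void main(String[] args) {
        // Small average distances along dimensions 0 and 2 give strongly negative z-scores,
        // which is why PROCLUS selects the *smallest* z_ij as cluster dimensions.
        double[] x = { 0.2, 3.0, 0.4, 2.8 };
        for (double z : zscores(x)) {
          System.out.printf("%.2f%n", z); // -0.93, 0.93, -0.80, 0.80
        }
      }
    }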
@@ -480,61 +479,61 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster private List<Pair<V, TIntSet>> findDimensions(List<PROCLUSCluster> clusters, Relation<V> database) { // compute x_ij = avg distance from points in c_i to c_i.centroid int dim = RelationUtil.dimensionality(database); - Map<Integer, double[]> averageDistances = new HashMap<Integer, double[]>(); + Map<Integer, double[]> averageDistances = new HashMap<>(); - for(int i = 0; i < clusters.size(); i++) { + for (int i = 0; i < clusters.size(); i++) { PROCLUSCluster c_i = clusters.get(i); double[] x_i = new double[dim]; - for(DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { + for (DBIDIter iter = c_i.objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); - for(int d = 0; d < dim; d++) { + for (int d = 0; d < dim; d++) { x_i[d] += Math.abs(c_i.centroid.doubleValue(d) - o.doubleValue(d)); } } - for(int d = 0; d < dim; d++) { + for (int d = 0; d < dim; d++) { x_i[d] /= c_i.objectIDs.size(); } averageDistances.put(i, x_i); } - List<CTriple<Double, Integer, Integer>> z_ijs = new ArrayList<CTriple<Double, Integer, Integer>>(); - for(int i = 0; i < clusters.size(); i++) { + List<CTriple<Double, Integer, Integer>> z_ijs = new ArrayList<>(); + for (int i = 0; i < clusters.size(); i++) { double[] x_i = averageDistances.get(i); // y_i double y_i = 0; - for(int j = 0; j < dim; j++) { + for (int j = 0; j < dim; j++) { y_i += x_i[j]; } y_i /= dim; // sigma_i double sigma_i = 0; - for(int j = 0; j < dim; j++) { + for (int j = 0; j < dim; j++) { double diff = x_i[j] - y_i; sigma_i += diff * diff; } sigma_i /= (dim - 1); sigma_i = Math.sqrt(sigma_i); - for(int j = 0; j < dim; j++) { - z_ijs.add(new CTriple<Double, Integer, Integer>((x_i[j] - y_i) / sigma_i, i, j)); + for (int j = 0; j < dim; j++) { + z_ijs.add(new CTriple<>((x_i[j] - y_i) / sigma_i, i, j)); } } Collections.sort(z_ijs); // mapping cluster index -> dimensions - Map<Integer, TIntSet> dimensionMap = new HashMap<Integer, TIntSet>(); + Map<Integer, TIntSet> dimensionMap = new HashMap<>(); int max = Math.max(k * l, 2); - for(int m = 0; m < max; m++) { + for (int m = 0; m < max; m++) { CTriple<Double, Integer, Integer> z_ij = z_ijs.get(m); TIntSet dims_i = dimensionMap.get(z_ij.getSecond()); - if(dims_i == null) { + if (dims_i == null) { dims_i = new TIntHashSet(); dimensionMap.put(z_ij.getSecond(), dims_i); } dims_i.add(z_ij.getThird()); - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("z_ij ").append(z_ij).append('\n'); @@ -544,11 +543,11 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster } // mapping cluster -> dimensions - List<Pair<V, TIntSet>> result = new ArrayList<Pair<V, TIntSet>>(); - for(int i : dimensionMap.keySet()) { + List<Pair<V, TIntSet>> result = new ArrayList<>(); + for (int i : dimensionMap.keySet()) { TIntSet dims_i = dimensionMap.get(i); PROCLUSCluster c_i = clusters.get(i); - result.add(new Pair<V, TIntSet>(c_i.centroid, dims_i)); + result.add(new Pair<>(c_i.centroid, dims_i)); } return result; } @@ -562,19 +561,19 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the assignments of the object to the clusters */ private Map<DBID, PROCLUSCluster> assignPoints(Map<DBID, TIntSet> dimensions, Relation<V> database) { - Map<DBID, ModifiableDBIDs> clusterIDs = new HashMap<DBID, ModifiableDBIDs>(); - for(DBID m_i : dimensions.keySet()) { + Map<DBID, 
ModifiableDBIDs> clusterIDs = new HashMap<>(); + for (DBID m_i : dimensions.keySet()) { clusterIDs.put(m_i, DBIDUtil.newHashSet()); } - for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { DBID p_id = DBIDUtil.deref(it); V p = database.get(p_id); DistanceDBIDPair<DoubleDistance> minDist = null; - for(DBID m_i : dimensions.keySet()) { + for (DBID m_i : dimensions.keySet()) { V m = database.get(m_i); DistanceDBIDPair<DoubleDistance> currentDist = DBIDUtil.newDistancePair(manhattanSegmentalDistance(p, m, dimensions.get(m_i)), m_i); - if(minDist == null || currentDist.compareByDistance(minDist) < 0) { + if (minDist == null || currentDist.compareByDistance(minDist) < 0) { minDist = currentDist; } } @@ -584,17 +583,17 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster ids.add(p_id); } - Map<DBID, PROCLUSCluster> clusters = new HashMap<DBID, PROCLUSCluster>(); - for(DBID m_i : dimensions.keySet()) { + Map<DBID, PROCLUSCluster> clusters = new HashMap<>(); + for (DBID m_i : dimensions.keySet()) { ModifiableDBIDs objectIDs = clusterIDs.get(m_i); - if(!objectIDs.isEmpty()) { + if (!objectIDs.isEmpty()) { TIntSet clusterDimensions = dimensions.get(m_i); V centroid = Centroid.make(database, objectIDs).toVector(database); clusters.put(m_i, new PROCLUSCluster(objectIDs, clusterDimensions, centroid)); } } - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("clusters ").append(clusters).append('\n'); @@ -612,22 +611,22 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster * @return the assignments of the object to the clusters */ private List<PROCLUSCluster> finalAssignment(List<Pair<V, TIntSet>> dimensions, Relation<V> database) { - Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<Integer, ModifiableDBIDs>(); - for(int i = 0; i < dimensions.size(); i++) { + Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>(); + for (int i = 0; i < dimensions.size(); i++) { clusterIDs.put(i, DBIDUtil.newHashSet()); } - for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { + for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { DBID p_id = DBIDUtil.deref(it); V p = database.get(p_id); Pair<DoubleDistance, Integer> minDist = null; - for(int i = 0; i < dimensions.size(); i++) { + for (int i = 0; i < dimensions.size(); i++) { Pair<V, TIntSet> pair_i = dimensions.get(i); V c_i = pair_i.first; TIntSet dimensions_i = pair_i.second; DoubleDistance currentDist = manhattanSegmentalDistance(p, c_i, dimensions_i); - if(minDist == null || currentDist.compareTo(minDist.first) < 0) { - minDist = new Pair<DoubleDistance, Integer>(currentDist, i); + if (minDist == null || currentDist.compareTo(minDist.first) < 0) { + minDist = new Pair<>(currentDist, i); } } // add p to cluster with mindist @@ -636,17 +635,17 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster ids.add(p_id); } - List<PROCLUSCluster> clusters = new ArrayList<PROCLUSCluster>(); - for(int i = 0; i < dimensions.size(); i++) { + List<PROCLUSCluster> clusters = new ArrayList<>(); + for (int i = 0; i < dimensions.size(); i++) { ModifiableDBIDs objectIDs = clusterIDs.get(i); - if(!objectIDs.isEmpty()) { + if (!objectIDs.isEmpty()) { TIntSet clusterDimensions = dimensions.get(i).second; V centroid = Centroid.make(database, objectIDs).toVector(database); clusters.add(new 
PROCLUSCluster(objectIDs, clusterDimensions, centroid)); } } - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append('\n'); msg.append("clusters ").append(clusters).append('\n'); @@ -667,7 +666,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster */ private DoubleDistance manhattanSegmentalDistance(V o1, V o2, TIntSet dimensions) { double result = 0; - for (TIntIterator iter = dimensions.iterator(); iter.hasNext(); ) { + for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { final int d = iter.next(); result += Math.abs(o1.doubleValue(d) - o2.doubleValue(d)); } @@ -685,13 +684,13 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster */ private double evaluateClusters(Map<DBID, PROCLUSCluster> clusters, Map<DBID, TIntSet> dimensions, Relation<V> database) { double result = 0; - for(DBID m_i : clusters.keySet()) { + for (DBID m_i : clusters.keySet()) { PROCLUSCluster c_i = clusters.get(m_i); V centroid_i = c_i.centroid; TIntSet dims_i = dimensions.get(m_i); double w_i = 0; - for (TIntIterator iter = dims_i.iterator(); iter.hasNext(); ) { + for (TIntIterator iter = dims_i.iterator(); iter.hasNext();) { final int j = iter.next(); w_i += avgDistance(centroid_i, c_i.objectIDs, database, j); } @@ -716,7 +715,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster */ private double avgDistance(V centroid, DBIDs objectIDs, Relation<V> database, int dimension) { Mean avg = new Mean(); - for(DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { + for (DBIDIter iter = objectIDs.iter(); iter.valid(); iter.advance()) { V o = database.get(iter); avg.put(Math.abs(centroid.doubleValue(dimension) - o.doubleValue(dimension))); } @@ -733,9 +732,9 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster */ private ModifiableDBIDs computeBadMedoids(Map<DBID, PROCLUSCluster> clusters, int threshold) { ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(); - for(DBID m_i : clusters.keySet()) { + for (DBID m_i : clusters.keySet()) { PROCLUSCluster c_i = clusters.get(m_i); - if(c_i.objectIDs.size() < threshold) { + if (c_i.objectIDs.size() < threshold) { badMedoids.add(m_i); } } @@ -791,11 +790,10 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster StringBuilder result = new StringBuilder(); result.append("Dimensions: ["); boolean notFirst = false; - for(TIntIterator iter = dimensions.iterator(); iter.hasNext(); ) { - if(notFirst) { + for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { + if (notFirst) { result.append(','); - } - else { + } else { notFirst = true; } result.append(iter.next()); @@ -813,7 +811,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster */ public BitSet getDimensions() { BitSet result = new BitSet(); - for(TIntIterator iter = dimensions.iterator(); iter.hasNext(); ) { + for (TIntIterator iter = dimensions.iterator(); iter.hasNext();) { result.set(iter.next()); } return result; @@ -847,19 +845,19 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster IntParameter m_iP = new IntParameter(M_I_ID, 10); m_iP.addConstraint(new GreaterConstraint(0)); - if(config.grab(m_iP)) { + if (config.grab(m_iP)) { m_i = m_iP.getValue(); } RandomParameter rndP = new RandomParameter(SEED_ID); - if(config.grab(rndP)) { + if (config.grab(rndP)) { rnd = rndP.getValue(); } } @Override protected PROCLUS<V> makeInstance() { - 
return new PROCLUS<V>(k, k_i, l, m_i, rnd); + return new PROCLUS<>(k, k_i, l, m_i, rnd); } } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java index fc3228eb..4e670974 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PreDeCon.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -111,7 +111,7 @@ public class PreDeCon<V extends NumberVector<?>> extends AbstractProjectedDBSCAN @Override protected PreDeCon<V> makeInstance() { - return new PreDeCon<V>(epsilon, minpts, outerdist, lambda); + return new PreDeCon<>(epsilon, minpts, outerdist, lambda); } } }
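Editor's note: the PROCLUS assignment hunks above (assignPoints, finalAssignment) assign each point to the medoid or centroid with the smallest manhattanSegmentalDistance, i.e. the Manhattan distance restricted to that cluster's selected dimensions. The hunk only shows the summation loop; in the PROCLUS paper the segmental distance is additionally averaged over the number of selected dimensions, and the sketch below follows the paper's averaged form. A standalone illustration on plain double[] vectors (hypothetical helper, not the ELKI implementation):

    public class SegmentalDistanceDemo {
      // Manhattan segmental distance: mean absolute difference over the selected dimensions.
      static double manhattanSegmental(double[] a, double[] b, int[] dims) {
        double sum = 0.;
        for (int d : dims) {
          sum += Math.abs(a[d] - b[d]);
        }
        return sum / dims.length;
      }

      public static void main(String[] args) {
        double[] p = { 1.0, 5.0, 2.0 };
        double[] m1 = { 1.5, 0.0, 2.5 };
        double[] m2 = { 4.0, 5.0, 9.0 };
        int[] subspace = { 0, 2 }; // the cluster's relevant dimensions; dimension 1 is ignored
        System.out.println(manhattanSegmental(p, m1, subspace)); // 0.5 -> p is assigned to m1
        System.out.println(manhattanSegmental(p, m2, subspace)); // 5.0
      }
    }

Restricting the distance to each medoid's own dimension set is what lets PROCLUS assign points belonging to clusters in different subspaces within a single pass over the database.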
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java index 46c5f0b8..c8d0833e 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -43,7 +43,7 @@ import de.lmu.ifi.dbs.elki.database.ProxyDatabase; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.AbstractDimensionsSelectingDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionSelectingSubspaceDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -105,7 +105,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster /** * Parameter to specify the maximum radius of the neighborhood to be * considered, must be suitable to - * {@link AbstractDimensionsSelectingDoubleDistanceFunction}. + * {@link DimensionSelectingSubspaceDistanceFunction}. * <p> * Key: {@code -subclu.epsilon} * </p> @@ -125,7 +125,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * Holds the instance of the distance function specified by * {@link #DISTANCE_FUNCTION_ID}. */ - private AbstractDimensionsSelectingDoubleDistanceFunction<V> distanceFunction; + private DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction; /** * Holds the value of {@link #EPSILON_ID}. @@ -149,7 +149,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * @param epsilon Epsilon value * @param minpts Minpts value */ - public SUBCLU(AbstractDimensionsSelectingDoubleDistanceFunction<V> distanceFunction, DoubleDistance epsilon, int minpts) { + public SUBCLU(DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction, DoubleDistance epsilon, int minpts) { super(); this.distanceFunction = distanceFunction; this.epsilon = epsilon; @@ -168,49 +168,49 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster StepProgress stepprog = LOG.isVerbose() ? 
new StepProgress(dimensionality) : null; // Generate all 1-dimensional clusters - if(stepprog != null) { + if (stepprog != null) { stepprog.beginStep(1, "Generate all 1-dimensional clusters.", LOG); } // mapping of dimensionality to set of subspaces - HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<Integer, List<Subspace>>(); + HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>(); // list of 1-dimensional subspaces containing clusters - List<Subspace> s_1 = new ArrayList<Subspace>(); + List<Subspace> s_1 = new ArrayList<>(); subspaceMap.put(0, s_1); // mapping of subspaces to list of clusters - TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<Subspace, List<Cluster<Model>>>(new Subspace.DimensionComparator()); + TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator()); - for(int d = 0; d < dimensionality; d++) { + for (int d = 0; d < dimensionality; d++) { Subspace currentSubspace = new Subspace(d); List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace); - if(LOG.isDebuggingFiner()) { + if (LOG.isDebuggingFiner()) { StringBuilder msg = new StringBuilder(); msg.append('\n').append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n"); - for(Cluster<Model> cluster : clusters) { + for (Cluster<Model> cluster : clusters) { msg.append(" " + cluster.getIDs() + "\n"); } LOG.debugFiner(msg.toString()); } - if(!clusters.isEmpty()) { + if (!clusters.isEmpty()) { s_1.add(currentSubspace); clusterMap.put(currentSubspace, clusters); } } // Generate (d+1)-dimensional clusters from d-dimensional clusters - for(int d = 0; d < dimensionality - 1; d++) { - if(stepprog != null) { + for (int d = 0; d < dimensionality - 1; d++) { + if (stepprog != null) { stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG); } List<Subspace> subspaces = subspaceMap.get(d); - if(subspaces == null || subspaces.isEmpty()) { - if(stepprog != null) { - for(int dim = d + 1; dim < dimensionality - 1; dim++) { + if (subspaces == null || subspaces.isEmpty()) { + if (stepprog != null) { + for (int dim = d + 1; dim < dimensionality - 1; dim++) { stepprog.beginStep(dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG); } } @@ -218,57 +218,57 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster } List<Subspace> candidates = generateSubspaceCandidates(subspaces); - List<Subspace> s_d = new ArrayList<Subspace>(); + List<Subspace> s_d = new ArrayList<>(); - for(Subspace candidate : candidates) { + for (Subspace candidate : candidates) { Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap); - if(LOG.isDebuggingFine()) { + if (LOG.isDebuggingFine()) { LOG.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString()); } List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace); - List<Cluster<Model>> clusters = new ArrayList<Cluster<Model>>(); - for(Cluster<Model> cluster : bestSubspaceClusters) { + List<Cluster<Model>> clusters = new ArrayList<>(); + for (Cluster<Model> cluster : bestSubspaceClusters) { List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate); - if(!candidateClusters.isEmpty()) { + if (!candidateClusters.isEmpty()) { clusters.addAll(candidateClusters); } } - 
if(LOG.isDebuggingFine()) { + if (LOG.isDebuggingFine()) { StringBuilder msg = new StringBuilder(); msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n"); - for(Cluster<Model> c : clusters) { + for (Cluster<Model> c : clusters) { msg.append(" " + c.getIDs() + "\n"); } LOG.debugFine(msg.toString()); } - if(!clusters.isEmpty()) { + if (!clusters.isEmpty()) { s_d.add(candidate); clusterMap.put(candidate, clusters); } } - if(!s_d.isEmpty()) { + if (!s_d.isEmpty()) { subspaceMap.put(d + 1, s_d); } } // build result int numClusters = 1; - result = new Clustering<SubspaceModel<V>>("SUBCLU clustering", "subclu-clustering"); - for(Subspace subspace : clusterMap.descendingKeySet()) { + result = new Clustering<>("SUBCLU clustering", "subclu-clustering"); + for (Subspace subspace : clusterMap.descendingKeySet()) { List<Cluster<Model>> clusters = clusterMap.get(subspace); - for(Cluster<Model> cluster : clusters) { - Cluster<SubspaceModel<V>> newCluster = new Cluster<SubspaceModel<V>>(cluster.getIDs()); - newCluster.setModel(new SubspaceModel<V>(subspace, Centroid.make(relation, cluster.getIDs()).toVector(relation))); + for (Cluster<Model> cluster : clusters) { + Cluster<SubspaceModel<V>> newCluster = new Cluster<>(cluster.getIDs()); + newCluster.setModel(new SubspaceModel<>(subspace, Centroid.make(relation, cluster.getIDs()).toVector(relation))); newCluster.setName("cluster_" + numClusters++); - result.addCluster(newCluster); + result.addToplevelCluster(newCluster); } } - if(stepprog != null) { + if (stepprog != null) { stepprog.setCompleted(LOG); } return result; @@ -300,7 +300,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster distanceFunction.setSelectedDimensions(subspace.getDimensions()); ProxyDatabase proxy; - if(ids == null) { + if (ids == null) { // TODO: in this case, we might want to use an index - the proxy below // will prevent this! 
ids = relation.getDBIDs(); @@ -308,18 +308,18 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster proxy = new ProxyDatabase(ids, relation); - DBSCAN<V, DoubleDistance> dbscan = new DBSCAN<V, DoubleDistance>(distanceFunction, epsilon, minpts); + DBSCAN<V, DoubleDistance> dbscan = new DBSCAN<>(distanceFunction, epsilon, minpts); // run DBSCAN - if(LOG.isVerbose()) { + if (LOG.isVerbose()) { LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString()); } Clustering<Model> dbsres = dbscan.run(proxy); // separate cluster and noise List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters(); - List<Cluster<Model>> clusters = new ArrayList<Cluster<Model>>(); - for(Cluster<Model> c : clusterAndNoise) { - if(!c.isNoise()) { + List<Cluster<Model>> clusters = new ArrayList<>(); + for (Cluster<Model> c : clusterAndNoise) { + if (!c.isNoise()) { clusters.add(c); } } @@ -334,9 +334,9 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster * @return the {@code d+1}-dimensional subspace candidates */ private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces) { - List<Subspace> candidates = new ArrayList<Subspace>(); + List<Subspace> candidates = new ArrayList<>(); - if(subspaces.isEmpty()) { + if (subspaces.isEmpty()) { return candidates; } @@ -344,46 +344,46 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster int d = subspaces.get(0).dimensionality(); StringBuilder msgFine = new StringBuilder("\n"); - if(LOG.isDebuggingFiner()) { + if (LOG.isDebuggingFiner()) { msgFine.append("subspaces ").append(subspaces).append('\n'); } - for(int i = 0; i < subspaces.size(); i++) { + for (int i = 0; i < subspaces.size(); i++) { Subspace s1 = subspaces.get(i); - for(int j = i + 1; j < subspaces.size(); j++) { + for (int j = i + 1; j < subspaces.size(); j++) { Subspace s2 = subspaces.get(j); Subspace candidate = s1.join(s2); - if(candidate != null) { - if(LOG.isDebuggingFiner()) { + if (candidate != null) { + if (LOG.isDebuggingFiner()) { msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n'); } // prune irrelevant candidate subspaces List<Subspace> lowerSubspaces = lowerSubspaces(candidate); - if(LOG.isDebuggingFiner()) { + if (LOG.isDebuggingFiner()) { msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n'); } boolean irrelevantCandidate = false; - for(Subspace s : lowerSubspaces) { - if(!subspaces.contains(s)) { + for (Subspace s : lowerSubspaces) { + if (!subspaces.contains(s)) { irrelevantCandidate = true; break; } } - if(!irrelevantCandidate) { + if (!irrelevantCandidate) { candidates.add(candidate); } } } } - if(LOG.isDebuggingFiner()) { + if (LOG.isDebuggingFiner()) { LOG.debugFiner(msgFine.toString()); } - if(LOG.isDebugging()) { + if (LOG.isDebugging()) { StringBuilder msg = new StringBuilder(); msg.append(d + 1).append("-dimensional candidate subspaces: "); - for(Subspace candidate : candidates) { + for (Subspace candidate : candidates) { msg.append(candidate.dimensonsToString()).append(' '); } LOG.debug(msg.toString()); @@ -401,14 +401,14 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster */ private List<Subspace> lowerSubspaces(Subspace subspace) { int dimensionality = subspace.dimensionality(); - if(dimensionality <= 1) { + if (dimensionality <= 1) { return null; } // order result according to the dimensions - List<Subspace> result = new ArrayList<Subspace>(); + List<Subspace> result = new ArrayList<>(); BitSet 
dimensions = subspace.getDimensions(); - for(int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) { + for (int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) { BitSet newDimensions = (BitSet) dimensions.clone(); newDimensions.set(dim, false); result.add(new Subspace(newDimensions)); @@ -432,14 +432,14 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster private Subspace bestSubspace(List<Subspace> subspaces, Subspace candidate, TreeMap<Subspace, List<Cluster<Model>>> clusterMap) { Subspace bestSubspace = null; - for(Subspace subspace : subspaces) { + for (Subspace subspace : subspaces) { int min = Integer.MAX_VALUE; - if(subspace.isSubspace(candidate)) { + if (subspace.isSubspace(candidate)) { List<Cluster<Model>> clusters = clusterMap.get(subspace); - for(Cluster<Model> cluster : clusters) { + for (Cluster<Model> cluster : clusters) { int clusterSize = cluster.size(); - if(clusterSize < min) { + if (clusterSize < min) { min = clusterSize; bestSubspace = subspace; } @@ -472,31 +472,31 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster protected DoubleDistance epsilon = null; - protected AbstractDimensionsSelectingDoubleDistanceFunction<V> distance = null; + protected DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distance = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>> param = new ObjectParameter<AbstractDimensionsSelectingDoubleDistanceFunction<V>>(DISTANCE_FUNCTION_ID, AbstractDimensionsSelectingDoubleDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); - if(config.grab(param)) { + ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance>> param = new ObjectParameter<>(DISTANCE_FUNCTION_ID, DimensionSelectingSubspaceDistanceFunction.class, SubspaceEuclideanDistanceFunction.class); + if (config.grab(param)) { distance = param.instantiateClass(config); } - DistanceParameter<DoubleDistance> epsilonP = new DistanceParameter<DoubleDistance>(EPSILON_ID, distance); - if(config.grab(epsilonP)) { + DistanceParameter<DoubleDistance> epsilonP = new DistanceParameter<>(EPSILON_ID, distance); + if (config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(new GreaterConstraint(0)); - if(config.grab(minptsP)) { + if (config.grab(minptsP)) { minpts = minptsP.getValue(); } } @Override protected SUBCLU<V> makeInstance() { - return new SUBCLU<V>(distance, epsilon, minpts); + return new SUBCLU<>(distance, epsilon, minpts); } } -}
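Editor's note: generateSubspaceCandidates and lowerSubspaces in the SUBCLU hunks above implement Apriori-style pruning: a (d+1)-dimensional candidate subspace can only contain clusters if every d-dimensional subspace of it does. A compact sketch of that monotonicity test on BitSet-encoded subspaces (class and method names hypothetical; the real code additionally builds candidates by joining subspaces that share d-1 dimensions):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.BitSet;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class SubspacePruningDemo {
      // All d-dimensional subspaces of a (d+1)-dimensional candidate: clear one set bit at a time.
      static List<BitSet> lowerSubspaces(BitSet candidate) {
        List<BitSet> result = new ArrayList<>();
        for (int d = candidate.nextSetBit(0); d >= 0; d = candidate.nextSetBit(d + 1)) {
          BitSet lower = (BitSet) candidate.clone();
          lower.clear(d);
          result.add(lower);
        }
        return result;
      }

      // Keep the candidate only if every lower subspace is known to contain clusters.
      static boolean isRelevant(BitSet candidate, Set<BitSet> subspacesWithClusters) {
        for (BitSet lower : lowerSubspaces(candidate)) {
          if (!subspacesWithClusters.contains(lower)) {
            return false;
          }
        }
        return true;
      }

      static BitSet bits(int... dims) {
        BitSet b = new BitSet();
        for (int d : dims) {
          b.set(d);
        }
        return b;
      }

      public static void main(String[] args) {
        Set<BitSet> clustered = new HashSet<>(Arrays.asList(bits(0), bits(1), bits(2), bits(0, 1), bits(0, 2)));
        System.out.println(isRelevant(bits(0, 1), clustered));    // true: {0} and {1} hold clusters
        System.out.println(isRelevant(bits(0, 1, 2), clustered)); // false: {1,2} holds no cluster
      }
    }

This pruning is what keeps the bottom-up traversal tractable: DBSCAN is only re-run (via runDBSCAN above) on candidates that survive the test, and only on the points of the best lower subspace's clusters.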
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java index 17eb3c19..561816bd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SubspaceClusteringAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java index 6b22b233..50e3fcd5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUESubspace.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -65,7 +65,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { */ public CLIQUESubspace(int dimension) { super(dimension); - denseUnits = new ArrayList<CLIQUEUnit<V>>(); + denseUnits = new ArrayList<>(); coverage = 0; } @@ -76,7 +76,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { */ public CLIQUESubspace(BitSet dimensions) { super(dimensions); - denseUnits = new ArrayList<CLIQUEUnit<V>>(); + denseUnits = new ArrayList<>(); coverage = 0; } @@ -104,12 +104,12 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { * @return the clusters in this subspace and the corresponding cluster models */ public List<Pair<Subspace, ModifiableDBIDs>> determineClusters() { - List<Pair<Subspace, ModifiableDBIDs>> clusters = new ArrayList<Pair<Subspace, ModifiableDBIDs>>(); + List<Pair<Subspace, ModifiableDBIDs>> clusters = new ArrayList<>(); for(CLIQUEUnit<V> unit : getDenseUnits()) { if(!unit.isAssigned()) { ModifiableDBIDs cluster = DBIDUtil.newHashSet(); - CLIQUESubspace<V> model = new CLIQUESubspace<V>(getDimensions()); + CLIQUESubspace<V> model = new CLIQUESubspace<>(getDimensions()); clusters.add(new Pair<Subspace, ModifiableDBIDs>(model, cluster)); dfs(unit, cluster, model); } @@ -217,7 +217,7 @@ public class CLIQUESubspace<V extends NumberVector<?>> extends Subspace { return null; } - CLIQUESubspace<V> s = new CLIQUESubspace<V>(dimensions); + CLIQUESubspace<V> s = new CLIQUESubspace<>(dimensions); for(CLIQUEUnit<V> u1 : this.getDenseUnits()) { for(CLIQUEUnit<V> u2 : other.getDenseUnits()) { CLIQUEUnit<V> u = u1.join(u2, all, tau); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java index 70f251c9..a71b2b67 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/CLIQUEUnit.java @@ -4,7 +4,7 @@ package 
de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -77,7 +77,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { public CLIQUEUnit(SortedSet<Interval> intervals, ModifiableDBIDs ids) { this.intervals = intervals; - dimensionToInterval = new TIntObjectHashMap<Interval>(); + dimensionToInterval = new TIntObjectHashMap<>(); for(Interval interval : intervals) { dimensionToInterval.put(interval.getDimension(), interval); } @@ -93,10 +93,10 @@ public class CLIQUEUnit<V extends NumberVector<?>> { * @param interval the interval belonging to this unit */ public CLIQUEUnit(Interval interval) { - intervals = new TreeSet<Interval>(); + intervals = new TreeSet<>(); intervals.add(interval); - dimensionToInterval = new TIntObjectHashMap<Interval>(); + dimensionToInterval = new TIntObjectHashMap<>(); dimensionToInterval.put(interval.getDimension(), interval); ids = DBIDUtil.newHashSet(); @@ -254,7 +254,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { Iterator<Interval> it1 = this.intervals.iterator(); Iterator<Interval> it2 = other.intervals.iterator(); - SortedSet<Interval> resultIntervals = new TreeSet<Interval>(); + SortedSet<Interval> resultIntervals = new TreeSet<>(); for(int i = 0; i < this.intervals.size() - 1; i++) { i1 = it1.next(); i2 = it2.next(); @@ -270,7 +270,7 @@ public class CLIQUEUnit<V extends NumberVector<?>> { resultIDs.retainAll(other.ids); if(resultIDs.size() / all >= tau) { - return new CLIQUEUnit<V>(resultIntervals, resultIDs); + return new CLIQUEUnit<>(resultIntervals, resultIDs); } return null; diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java index 7a686190..7acd7572 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/clique/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java index 2a1eb930..2efa038d 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/package-info.java @@ -10,7 +10,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java index af8fb1ea..3b5d0ec2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for 
Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -144,7 +144,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl HashMap<String, DBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation); ModifiableDBIDs noiseids = DBIDUtil.newArray(); - Clustering<Model> result = new Clustering<Model>("By Label Clustering", "bylabel-clustering"); + Clustering<Model> result = new Clustering<>("By Label Clustering", "bylabel-clustering"); for(Entry<String, DBIDs> entry : labelMap.entrySet()) { DBIDs ids = entry.getValue(); if(ids.size() <= 1) { @@ -156,13 +156,13 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl if(noisepattern != null && noisepattern.matcher(entry.getKey()).find()) { c.setNoise(true); } - result.addCluster(c); + result.addToplevelCluster(c); } // Collected noise IDs. if(noiseids.size() > 0) { Cluster<Model> c = new Cluster<Model>("Noise", noiseids, ClusterModel.CLUSTER); c.setNoise(true); - result.addCluster(c); + result.addToplevelCluster(c); } return result; } @@ -175,7 +175,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl * @return a mapping of labels to ids */ private HashMap<String, DBIDs> singleAssignment(Relation<?> data) { - HashMap<String, DBIDs> labelMap = new HashMap<String, DBIDs>(); + HashMap<String, DBIDs> labelMap = new HashMap<>(); for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { final Object val = data.get(iditer); @@ -193,7 +193,7 @@ public class ByLabelClustering extends AbstractAlgorithm<Clustering<Model>> impl * @return a mapping of labels to ids */ private HashMap<String, DBIDs> multipleAssignment(Relation<?> data) { - HashMap<String, DBIDs> labelMap = new HashMap<String, DBIDs>(); + HashMap<String, DBIDs> labelMap = new HashMap<>(); for(DBIDIter iditer = data.iterDBIDs(); iditer.valid(); iditer.advance()) { String[] labels = data.get(iditer).toString().split(" "); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java index dfb7d37f..33101221 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelHierarchicalClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -88,8 +88,7 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< try { Relation<ClassLabel> relation = database.getRelation(TypeUtil.CLASSLABEL); return run(relation); - } - catch(NoSupportedDataTypeException e) { + } catch (NoSupportedDataTypeException e) { // Otherwise, try any labellike. 
return run(database.getRelation(getInputTypeRestriction()[0])); } @@ -101,12 +100,13 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< * @param relation The data input to use */ public Clustering<Model> run(Relation<?> relation) { - HashMap<String, DBIDs> labelmap = new HashMap<String, DBIDs>(); + HashMap<String, DBIDs> labelmap = new HashMap<>(); ModifiableDBIDs noiseids = DBIDUtil.newArray(); + Clustering<Model> clustering = new Clustering<>("By Label Hierarchical Clustering", "bylabel-clustering"); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { final Object val = relation.get(iditer); - if(val == null) { + if (val == null) { noiseids.add(iditer); continue; } @@ -115,44 +115,41 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< assign(labelmap, label, iditer); } - ArrayList<Cluster<Model>> clusters = new ArrayList<Cluster<Model>>(labelmap.size()); - for(Entry<String, DBIDs> entry : labelmap.entrySet()) { + ArrayList<Cluster<Model>> clusters = new ArrayList<>(labelmap.size()); + for (Entry<String, DBIDs> entry : labelmap.entrySet()) { DBIDs ids = entry.getValue(); - if(ids instanceof DBID) { + if (ids instanceof DBID) { noiseids.add((DBID) ids); continue; } - Cluster<Model> clus = new Cluster<Model>(entry.getKey(), ids, ClusterModel.CLUSTER, new ArrayList<Cluster<Model>>(), new ArrayList<Cluster<Model>>()); + Cluster<Model> clus = new Cluster<Model>(entry.getKey(), ids, ClusterModel.CLUSTER); clusters.add(clus); } - for(Cluster<Model> cur : clusters) { - for(Cluster<Model> oth : clusters) { - if(oth != cur) { - if(oth.getName().startsWith(cur.getName())) { - oth.getParents().add(cur); - cur.getChildren().add(oth); - // System.err.println(oth.getLabel() + " is a child of " + - // cur.getLabel()); + for (Cluster<Model> cur : clusters) { + boolean isrootcluster = true; + for (Cluster<Model> oth : clusters) { + if (oth != cur) { + if (oth.getName().startsWith(cur.getName())) { + clustering.addChildCluster(oth, cur); + if (LOG.isDebuggingFiner()) { + LOG.debugFiner(oth.getName() + " is a child of " + cur.getName()); + } + isrootcluster = false; } } } - } - ArrayList<Cluster<Model>> rootclusters = new ArrayList<Cluster<Model>>(); - for(Cluster<Model> cur : clusters) { - if(cur.getParents().size() == 0) { - rootclusters.add(cur); + if (isrootcluster) { + clustering.addToplevelCluster(cur); } } // Collected noise IDs. - if(noiseids.size() > 0) { + if (noiseids.size() > 0) { Cluster<Model> c = new Cluster<Model>("Noise", noiseids, ClusterModel.CLUSTER); c.setNoise(true); - rootclusters.add(c); + clustering.addToplevelCluster(c); } - assert (rootclusters.size() > 0) : "No clusters found by bylabel clustering. 
Empty database?"; - - return new Clustering<Model>("By Label Hierarchical Clustering", "bylabel-clustering", rootclusters); + return clustering; } /** @@ -163,21 +160,19 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< * @param id the id of the object to be assigned */ private void assign(HashMap<String, DBIDs> labelMap, String label, DBIDRef id) { - if(labelMap.containsKey(label)) { + if (labelMap.containsKey(label)) { DBIDs exist = labelMap.get(label); - if(exist instanceof DBID) { + if (exist instanceof DBID) { ModifiableDBIDs n = DBIDUtil.newHashSet(); n.add((DBID) exist); n.add(id); labelMap.put(label, n); - } - else { + } else { assert (exist instanceof HashSetModifiableDBIDs); assert (exist.size() > 1); ((ModifiableDBIDs) exist).add(id); } - } - else { + } else { labelMap.put(label, DBIDUtil.deref(id)); } } @@ -191,4 +186,4 @@ public class ByLabelHierarchicalClustering extends AbstractAlgorithm<Clustering< protected Logging getLogger() { return LOG; } -}
\ No newline at end of file +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java index f082db9c..76b024a2 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByLabelOrAllInOneClustering.java @@ -15,7 +15,7 @@ import de.lmu.ifi.dbs.elki.database.relation.Relation; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -66,9 +66,9 @@ public class ByLabelOrAllInOneClustering extends ByLabelClustering { // Ignore. } final DBIDs ids = database.getRelation(TypeUtil.ANY).getDBIDs(); - Clustering<Model> result = new Clustering<Model>("All-in-one trivial Clustering", "allinone-clustering"); + Clustering<Model> result = new Clustering<>("All-in-one trivial Clustering", "allinone-clustering"); Cluster<Model> c = new Cluster<Model>(ids, ClusterModel.CLUSTER); - result.addCluster(c); + result.addToplevelCluster(c); return result; } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java index 2114ac16..73ad9880 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/ByModelClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -101,7 +101,7 @@ public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> impl */ public Clustering<Model> run(Relation<Model> relation) { // Build model mapping - HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<Model, ModifiableDBIDs>(); + HashMap<Model, ModifiableDBIDs> modelMap = new HashMap<>(); for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { Model model = relation.get(iditer); ModifiableDBIDs modelids = modelMap.get(model); @@ -112,16 +112,16 @@ public class ByModelClustering extends AbstractAlgorithm<Clustering<Model>> impl modelids.add(iditer); } - Clustering<Model> result = new Clustering<Model>("By Model Clustering", "bymodel-clustering"); + Clustering<Model> result = new Clustering<>("By Model Clustering", "bymodel-clustering"); for(Entry<Model, ModifiableDBIDs> entry : modelMap.entrySet()) { final Model model = entry.getKey(); final ModifiableDBIDs ids = entry.getValue(); final String name = (model instanceof GeneratorInterface) ? 
((GeneratorInterface) model).getName() : model.toString(); - Cluster<Model> c = new Cluster<Model>(name, ids, model); + Cluster<Model> c = new Cluster<>(name, ids, model); if(noisepattern != null && noisepattern.matcher(name).find()) { c.setNoise(true); } - result.addCluster(c); + result.addToplevelCluster(c); } return result; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java index eaa5d2b2..dae50c25 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllInOne.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -63,9 +63,9 @@ public class TrivialAllInOne extends AbstractAlgorithm<Clustering<Model>> implem public Clustering<Model> run(Relation<?> relation) { final DBIDs ids = relation.getDBIDs(); - Clustering<Model> result = new Clustering<Model>("All-in-one trivial Clustering", "allinone-clustering"); + Clustering<Model> result = new Clustering<>("All-in-one trivial Clustering", "allinone-clustering"); Cluster<Model> c = new Cluster<Model>(ids, ClusterModel.CLUSTER); - result.addCluster(c); + result.addToplevelCluster(c); return result; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java index dd0f94a5..ecc7dbec 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/TrivialAllNoise.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.trivial; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2012 + Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -62,9 +62,9 @@ public class TrivialAllNoise extends AbstractAlgorithm<Clustering<Model>> implem public Clustering<Model> run(Relation<?> relation) { final DBIDs ids = relation.getDBIDs(); - Clustering<Model> result = new Clustering<Model>("All-in-noise trivial Clustering", "allinnoise-clustering"); + Clustering<Model> result = new Clustering<>("All-in-noise trivial Clustering", "allinnoise-clustering"); Cluster<Model> c = new Cluster<Model>(ids, true, ClusterModel.CLUSTER); - result.addCluster(c); + result.addToplevelCluster(c); return result; } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java index 5870a736..6b7b50f5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/trivial/package-info.java @@ -7,7 +7,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures -Copyright (C) 2012 +Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |
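The change that repeats across all of these hunks is the move away from result.addCluster(c) and per-Cluster parent/child lists: clusters are now attached to the Clustering object itself, flat clusters via addToplevelCluster and hierarchy edges via addChildCluster, and the Java 7 diamond operator replaces the repeated type arguments in the constructors. As a reading aid only, the following minimal sketch assembles a small result against the post-change API. The class and method names are taken from the hunks above, but the import paths, the constructor overloads, and the addChildCluster(parent, child) argument order are assumptions for illustration, not code from this commit.

// Minimal sketch of the post-refactoring result assembly seen above.
// Import paths and the addChildCluster argument order are assumed from
// the diff context; this is an illustration, not code from the commit.
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;

public class ResultAssemblySketch {
  Clustering<Model> buildExample(DBIDs parentIds, DBIDs childIds, DBIDs noiseIds) {
    // Diamond operator, as introduced throughout this commit.
    Clustering<Model> result = new Clustering<>("Example Clustering", "example-clustering");

    // Flat clusters replace the old result.addCluster(...) calls.
    Cluster<Model> parent = new Cluster<Model>("A", parentIds, ClusterModel.CLUSTER);
    result.addToplevelCluster(parent);

    // Hierarchy edges now live in the Clustering, not in per-Cluster
    // parent/child lists; the (parent, child) order is an assumption here.
    Cluster<Model> child = new Cluster<Model>("A.1", childIds, ClusterModel.CLUSTER);
    result.addChildCluster(parent, child);

    // Noise handling is unchanged: a flagged cluster added at the top level.
    if(noiseIds.size() > 0) {
      Cluster<Model> noise = new Cluster<Model>("Noise", noiseIds, ClusterModel.CLUSTER);
      noise.setNoise(true);
      result.addToplevelCluster(noise);
    }
    return result;
  }
}

One consequence of this design, visible in the reworked ByLabelHierarchicalClustering.run, is that root detection no longer relies on cur.getParents().size() == 0: the loop tracks an isrootcluster flag and calls addToplevelCluster for clusters that acquired no hierarchy edge, so the intermediate rootclusters list disappears and the method returns the Clustering directly.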