diff options
Diffstat (limited to 'src/tutorial/clustering')
6 files changed, 140 insertions, 160 deletions
diff --git a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering1.java b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering1.java index 7477c064..b1406274 100644 --- a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering1.java +++ b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering1.java @@ -4,7 +4,7 @@ package tutorial.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -46,7 +46,6 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.result.Result; @@ -69,7 +68,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * * @param <O> Object type */ -public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, Result> { +public class NaiveAgglomerativeHierarchicalClustering1<O> extends AbstractDistanceBasedAlgorithm<O, Result> { /** * Class logger */ @@ -86,7 +85,7 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan * @param distanceFunction Distance function to use * @param numclusters Number of clusters */ - public NaiveAgglomerativeHierarchicalClustering1(DistanceFunction<? super O, D> distanceFunction, int numclusters) { + public NaiveAgglomerativeHierarchicalClustering1(DistanceFunction<? super O> distanceFunction, int numclusters) { super(distanceFunction); this.numclusters = numclusters; } @@ -99,7 +98,7 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan * @return Clustering hierarchy */ public Result run(Database db, Relation<O> relation) { - DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction()); + DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); @@ -108,10 +107,10 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan // Compute the initial distance matrix. double[][] matrix = new double[size][size]; DBIDArrayIter ix = ids.iter(), iy = ids.iter(); - for (int x = 0; ix.valid(); x++, ix.advance()) { + for(int x = 0; ix.valid(); x++, ix.advance()) { iy.seek(0); - for (int y = 0; y < x; y++, iy.advance()) { - final double dist = dq.distance(ix, iy).doubleValue(); + for(int y = 0; y < x; y++, iy.advance()) { + final double dist = dq.distance(ix, iy); matrix[x][y] = dist; matrix[y][x] = dist; } @@ -129,18 +128,18 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan // Repeat until everything merged, except the desired number of clusters: final int stop = size - numclusters; FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null; - for (int i = 0; i < stop; i++) { + for(int i = 0; i < stop; i++) { double min = Double.POSITIVE_INFINITY; int minx = -1, miny = -1; - for (int x = 0; x < size; x++) { - if (height[x] < Double.POSITIVE_INFINITY) { + for(int x = 0; x < size; x++) { + if(height[x] < Double.POSITIVE_INFINITY) { continue; } - for (int y = 0; y < x; y++) { - if (height[y] < Double.POSITIVE_INFINITY) { + for(int y = 0; y < x; y++) { + if(height[y] < Double.POSITIVE_INFINITY) { continue; } - if (matrix[x][y] < min) { + if(matrix[x][y] < min) { min = matrix[x][y]; minx = x; miny = y; @@ -158,36 +157,33 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan // Merge into cluster ModifiableDBIDs cx = clusters.get(minx); ModifiableDBIDs cy = clusters.get(miny); - if (cy == null) { + if(cy == null) { cy = DBIDUtil.newHashSet(); cy.add(iy); } - if (cx == null) { + if(cx == null) { cy.add(ix); - } else { + } + else { cy.addDBIDs(cx); clusters.remove(minx); } clusters.put(miny, cy); // Update distance matrix for y: - for (int j = 0; j < size; j++) { + for(int j = 0; j < size; j++) { matrix[j][miny] = Math.min(matrix[j][minx], matrix[j][miny]); matrix[miny][j] = Math.min(matrix[minx][j], matrix[miny][j]); } - if (prog != null) { - prog.incrementProcessed(LOG); - } - } - if (prog != null) { - prog.ensureCompleted(LOG); + LOG.incrementProcessed(prog); } + LOG.ensureCompleted(prog); // Build the clustering result final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering"); - for (int x = 0; x < size; x++) { - if (height[x] < Double.POSITIVE_INFINITY) { + for(int x = 0; x < size; x++) { + if(height[x] < Double.POSITIVE_INFINITY) { DBIDs cids = clusters.get(x); - if (cids == null) { + if(cids == null) { ix.seek(x); cids = DBIDUtil.deref(ix); } @@ -214,10 +210,11 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan * * @author Erich Schubert * + * @apiviz.exclude + * * @param <O> Object type - * @param <D> Distance type */ - public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** * Desired number of clusters. */ @@ -228,13 +225,13 @@ public class NaiveAgglomerativeHierarchicalClustering1<O, D extends NumberDistan super.makeOptions(config); IntParameter numclustersP = new IntParameter(ExtractFlatClusteringFromHierarchy.Parameterizer.MINCLUSTERS_ID); numclustersP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); - if (config.grab(numclustersP)) { + if(config.grab(numclustersP)) { numclusters = numclustersP.intValue(); } } @Override - protected NaiveAgglomerativeHierarchicalClustering1<O, D> makeInstance() { + protected NaiveAgglomerativeHierarchicalClustering1<O> makeInstance() { return new NaiveAgglomerativeHierarchicalClustering1<>(distanceFunction, numclusters); } } diff --git a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering2.java b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering2.java index d33ca768..0081a845 100644 --- a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering2.java +++ b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering2.java @@ -4,7 +4,7 @@ package tutorial.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -46,7 +46,6 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.result.Result; @@ -69,7 +68,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * * @param <O> Object type */ -public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, Result> { +public class NaiveAgglomerativeHierarchicalClustering2<O> extends AbstractDistanceBasedAlgorithm<O, Result> { /** * Class logger */ @@ -86,7 +85,7 @@ public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistan * @param distanceFunction Distance function to use * @param numclusters Number of clusters */ - public NaiveAgglomerativeHierarchicalClustering2(DistanceFunction<? super O, D> distanceFunction, int numclusters) { + public NaiveAgglomerativeHierarchicalClustering2(DistanceFunction<? super O> distanceFunction, int numclusters) { super(distanceFunction); this.numclusters = numclusters; } @@ -99,7 +98,7 @@ public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistan * @return Clustering hierarchy */ public Result run(Database db, Relation<O> relation) { - DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction()); + DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); @@ -116,7 +115,7 @@ public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistan for (int x = 0; ix.valid(); x++, ix.advance()) { iy.seek(0); for (int y = 0; y < x; y++, iy.advance()) { - scratch[pos] = dq.distance(ix, iy).doubleValue(); + scratch[pos] = dq.distance(ix, iy); pos++; } } @@ -200,13 +199,9 @@ public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistan final int jbase = triangleSize(j); scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]); } - if (prog != null) { - prog.incrementProcessed(LOG); - } - } - if (prog != null) { - prog.ensureCompleted(LOG); + LOG.incrementProcessed(prog); } + LOG.ensureCompleted(prog); // Build the clustering result final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering"); @@ -251,10 +246,11 @@ public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistan * * @author Erich Schubert * + * @apiviz.exclude + * * @param <O> Object type - * @param <D> Distance type */ - public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** * Desired number of clusters. */ @@ -271,7 +267,7 @@ public class NaiveAgglomerativeHierarchicalClustering2<O, D extends NumberDistan } @Override - protected NaiveAgglomerativeHierarchicalClustering2<O, D> makeInstance() { + protected NaiveAgglomerativeHierarchicalClustering2<O> makeInstance() { return new NaiveAgglomerativeHierarchicalClustering2<>(distanceFunction, numclusters); } } diff --git a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering3.java b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering3.java index d3312115..0b1590bd 100644 --- a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering3.java +++ b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering3.java @@ -4,7 +4,7 @@ package tutorial.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -47,7 +47,6 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.result.Result; @@ -81,7 +80,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; * @param <O> Object type */ @Reference(title = "A Review of Classification", authors = "R. M. Cormack", booktitle = "Journal of the Royal Statistical Society. Series A, Vol. 134, No. 3", url = "http://www.jstor.org/stable/2344237") -public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, Result> { +public class NaiveAgglomerativeHierarchicalClustering3<O> extends AbstractDistanceBasedAlgorithm<O, Result> { /** * Class logger */ @@ -94,6 +93,8 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan * R. M. Cormack, A Review of Classification * * @author Erich Schubert + * + * @apiviz.exclude */ public static enum Linkage {// SINGLE { @@ -168,7 +169,7 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan * @param numclusters Number of clusters * @param linkage Linkage strategy */ - public NaiveAgglomerativeHierarchicalClustering3(DistanceFunction<? super O, D> distanceFunction, int numclusters, Linkage linkage) { + public NaiveAgglomerativeHierarchicalClustering3(DistanceFunction<? super O> distanceFunction, int numclusters, Linkage linkage) { super(distanceFunction); this.numclusters = numclusters; this.linkage = linkage; @@ -182,14 +183,14 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan * @return Clustering hierarchy */ public Result run(Database db, Relation<O> relation) { - DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction()); + DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); - if (size > 0x10000) { + if(size > 0x10000) { throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow."); } - if (Linkage.SINGLE.equals(linkage)) { + if(Linkage.SINGLE.equals(linkage)) { LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!"); } @@ -199,12 +200,12 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan // Position counter - must agree with computeOffset! int pos = 0; boolean square = Linkage.WARD.equals(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction())); - for (int x = 0; ix.valid(); x++, ix.advance()) { + for(int x = 0; ix.valid(); x++, ix.advance()) { iy.seek(0); - for (int y = 0; y < x; y++, iy.advance()) { - scratch[pos] = dq.distance(ix, iy).doubleValue(); + for(int y = 0; y < x; y++, iy.advance()) { + scratch[pos] = dq.distance(ix, iy); // Ward uses variances -- i.e. squared values - if (square) { + if(square) { scratch[pos] *= scratch[pos]; } pos++; @@ -223,20 +224,20 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan // Repeat until everything merged, except the desired number of clusters: final int stop = size - numclusters; FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null; - for (int i = 0; i < stop; i++) { + for(int i = 0; i < stop; i++) { double min = Double.POSITIVE_INFINITY; int minx = -1, miny = -1; - for (int x = 0; x < size; x++) { - if (height[x] < Double.POSITIVE_INFINITY) { + for(int x = 0; x < size; x++) { + if(height[x] < Double.POSITIVE_INFINITY) { continue; } final int xbase = triangleSize(x); - for (int y = 0; y < x; y++) { - if (height[y] < Double.POSITIVE_INFINITY) { + for(int y = 0; y < x; y++) { + if(height[y] < Double.POSITIVE_INFINITY) { continue; } final int idx = xbase + y; - if (scratch[idx] < min) { + if(scratch[idx] < min) { min = scratch[idx]; minx = x; miny = y; @@ -255,15 +256,17 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan ModifiableDBIDs cx = clusters.get(minx); ModifiableDBIDs cy = clusters.get(miny); int sizex = 1, sizey = 1; // cluster sizes, for averaging - if (cy == null) { + if(cy == null) { cy = DBIDUtil.newHashSet(); cy.add(iy); - } else { + } + else { sizey = cy.size(); } - if (cx == null) { + if(cx == null) { cy.add(ix); - } else { + } + else { sizex = cx.size(); cy.addDBIDs(cx); clusters.remove(minx); @@ -276,8 +279,8 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan // hashmap lookup. final int xbase = triangleSize(minx), ybase = triangleSize(miny); // Write to (y, j), with j < y - for (int j = 0; j < miny; j++) { - if (height[j] < Double.POSITIVE_INFINITY) { + for(int j = 0; j < miny; j++) { + if(height[j] < Double.POSITIVE_INFINITY) { continue; } final DBIDs idsj = clusters.get(j); @@ -285,8 +288,8 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan scratch[ybase + j] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[ybase + j], sizej, min); } // Write to (j, y), with y < j < x - for (int j = miny + 1; j < minx; j++) { - if (height[j] < Double.POSITIVE_INFINITY) { + for(int j = miny + 1; j < minx; j++) { + if(height[j] < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(j); @@ -295,8 +298,8 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[jbase + miny], sizej, min); } // Write to (j, y), with y < x < j - for (int j = minx + 1; j < size; j++) { - if (height[j] < Double.POSITIVE_INFINITY) { + for(int j = minx + 1; j < size; j++) { + if(height[j] < Double.POSITIVE_INFINITY) { continue; } final DBIDs idsj = clusters.get(j); @@ -304,20 +307,16 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan final int jbase = triangleSize(j); scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min); } - if (prog != null) { - prog.incrementProcessed(LOG); - } - } - if (prog != null) { - prog.ensureCompleted(LOG); + LOG.incrementProcessed(prog); } + LOG.ensureCompleted(prog); // Build the clustering result final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering"); - for (int x = 0; x < size; x++) { - if (height[x] < Double.POSITIVE_INFINITY) { + for(int x = 0; x < size; x++) { + if(height[x] < Double.POSITIVE_INFINITY) { DBIDs cids = clusters.get(x); - if (cids == null) { + if(cids == null) { ix.seek(x); cids = DBIDUtil.deref(ix); } @@ -355,10 +354,11 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan * * @author Erich Schubert * + * @apiviz.exclude + * * @param <O> Object type - * @param <D> Distance type */ - public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** * Option ID for linkage parameter. */ @@ -379,19 +379,19 @@ public class NaiveAgglomerativeHierarchicalClustering3<O, D extends NumberDistan super.makeOptions(config); IntParameter numclustersP = new IntParameter(ExtractFlatClusteringFromHierarchy.Parameterizer.MINCLUSTERS_ID); numclustersP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); - if (config.grab(numclustersP)) { + if(config.grab(numclustersP)) { numclusters = numclustersP.intValue(); } EnumParameter<Linkage> linkageP = new EnumParameter<>(LINKAGE_ID, Linkage.class); linkageP.setDefaultValue(Linkage.WARD); - if (config.grab(linkageP)) { + if(config.grab(linkageP)) { linkage = linkageP.getValue(); } } @Override - protected NaiveAgglomerativeHierarchicalClustering3<O, D> makeInstance() { + protected NaiveAgglomerativeHierarchicalClustering3<O> makeInstance() { return new NaiveAgglomerativeHierarchicalClustering3<>(distanceFunction, numclusters, linkage); } } diff --git a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering4.java b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering4.java index 23678ccc..997b657a 100644 --- a/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering4.java +++ b/src/tutorial/clustering/NaiveAgglomerativeHierarchicalClustering4.java @@ -4,7 +4,7 @@ package tutorial.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -33,7 +33,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; @@ -43,8 +43,6 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -75,7 +73,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; * @param <O> Object type */ @Reference(title = "A Review of Classification", authors = "R. M. Cormack", booktitle = "Journal of the Royal Statistical Society. Series A, Vol. 134, No. 3", url = "http://www.jstor.org/stable/2344237") -public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, PointerHierarchyRepresentationResult<DoubleDistance>> implements HierarchicalClusteringAlgorithm<DoubleDistance> { +public class NaiveAgglomerativeHierarchicalClustering4<O> extends AbstractDistanceBasedAlgorithm<O, PointerHierarchyRepresentationResult> implements HierarchicalClusteringAlgorithm { /** * Class logger */ @@ -88,6 +86,8 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan * R. M. Cormack, A Review of Classification * * @author Erich Schubert + * + * @apiviz.exclude */ public static enum Linkage {// SINGLE { @@ -156,7 +156,7 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan * @param distanceFunction Distance function to use * @param linkage Linkage strategy */ - public NaiveAgglomerativeHierarchicalClustering4(DistanceFunction<? super O, D> distanceFunction, Linkage linkage) { + public NaiveAgglomerativeHierarchicalClustering4(DistanceFunction<? super O> distanceFunction, Linkage linkage) { super(distanceFunction); this.linkage = linkage; } @@ -168,15 +168,15 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan * @param relation Relation * @return Clustering hierarchy */ - public PointerHierarchyRepresentationResult<DoubleDistance> run(Database db, Relation<O> relation) { - DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction()); + public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) { + DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); - if (size > 0x10000) { + if(size > 0x10000) { throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow."); } - if (Linkage.SINGLE.equals(linkage)) { + if(Linkage.SINGLE.equals(linkage)) { LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!"); } @@ -186,12 +186,12 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan // Position counter - must agree with computeOffset! int pos = 0; boolean square = Linkage.WARD.equals(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction())); - for (int x = 0; ix.valid(); x++, ix.advance()) { + for(int x = 0; ix.valid(); x++, ix.advance()) { iy.seek(0); - for (int y = 0; y < x; y++, iy.advance()) { - scratch[pos] = dq.distance(ix, iy).doubleValue(); + for(int y = 0; y < x; y++, iy.advance()) { + scratch[pos] = dq.distance(ix, iy); // Ward uses variances -- i.e. squared values - if (square) { + if(square) { scratch[pos] *= scratch[pos]; } pos++; @@ -200,9 +200,9 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan // Initialize space for result: WritableDBIDDataStore parent = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); - WritableDoubleDistanceDataStore height = DataStoreUtil.makeDoubleDistanceStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); + WritableDoubleDataStore height = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { parent.put(it, it); height.put(it, Double.POSITIVE_INFINITY); csize.put(it, 1); @@ -210,20 +210,20 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan // Repeat until everything merged, except the desired number of clusters: FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null; - for (int i = 1; i < size; i++) { + for(int i = 1; i < size; i++) { double min = Double.POSITIVE_INFINITY; int minx = -1, miny = -1; - for (ix.seek(0); ix.valid(); ix.advance()) { - if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) { + for(ix.seek(0); ix.valid(); ix.advance()) { + if(height.doubleValue(ix) < Double.POSITIVE_INFINITY) { continue; } final int xbase = triangleSize(ix.getOffset()); - for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { - if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) { + for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { + if(height.doubleValue(iy) < Double.POSITIVE_INFINITY) { continue; } final int idx = xbase + iy.getOffset(); - if (scratch[idx] <= min) { + if(scratch[idx] <= min) { min = scratch[idx]; minx = ix.getOffset(); miny = iy.getOffset(); @@ -244,16 +244,16 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan // Update distance matrix. Note: miny < minx final int xbase = triangleSize(minx), ybase = triangleSize(miny); // Write to (y, j), with j < y - for (ij.seek(0); ij.getOffset() < miny; ij.advance()) { - if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { + for(ij.seek(0); ij.getOffset() < miny; ij.advance()) { + if(height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int sizej = csize.intValue(ij); scratch[ybase + ij.getOffset()] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, min); } // Write to (j, y), with y < j < x - for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) { - if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { + for(ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) { + if(height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); @@ -261,23 +261,19 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min); } // Write to (j, y), with y < x < j - for (ij.seek(minx + 1); ij.valid(); ij.advance()) { - if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) { + for(ij.seek(minx + 1); ij.valid(); ij.advance()) { + if(height.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); final int sizej = csize.intValue(ij); scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min); } - if (prog != null) { - prog.incrementProcessed(LOG); - } - } - if (prog != null) { - prog.ensureCompleted(LOG); + LOG.incrementProcessed(prog); } + LOG.ensureCompleted(prog); - return new PointerHierarchyRepresentationResult<>(ids, parent, height); + return new PointerHierarchyRepresentationResult(ids, parent, height); } /** @@ -301,20 +297,16 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan return LOG; } - @Override - public DoubleDistance getDistanceFactory() { - return DoubleDistance.FACTORY; - } - /** * Parameterization class * * @author Erich Schubert * + * @apiviz.exclude + * * @param <O> Object type - * @param <D> Distance type */ - public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** * Option ID for linkage parameter. */ @@ -330,13 +322,13 @@ public class NaiveAgglomerativeHierarchicalClustering4<O, D extends NumberDistan super.makeOptions(config); EnumParameter<Linkage> linkageP = new EnumParameter<>(LINKAGE_ID, Linkage.class); linkageP.setDefaultValue(Linkage.WARD); - if (config.grab(linkageP)) { + if(config.grab(linkageP)) { linkage = linkageP.getValue(); } } @Override - protected NaiveAgglomerativeHierarchicalClustering4<O, D> makeInstance() { + protected NaiveAgglomerativeHierarchicalClustering4<O> makeInstance() { return new NaiveAgglomerativeHierarchicalClustering4<>(distanceFunction, linkage); } } diff --git a/src/tutorial/clustering/SameSizeKMeansAlgorithm.java b/src/tutorial/clustering/SameSizeKMeansAlgorithm.java index 91da44c5..ad57885e 100644 --- a/src/tutorial/clustering/SameSizeKMeansAlgorithm.java +++ b/src/tutorial/clustering/SameSizeKMeansAlgorithm.java @@ -4,7 +4,7 @@ package tutorial.clustering; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -29,8 +29,8 @@ import java.util.Comparator; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.AbstractKMeans; -import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansInitialization; -import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.KMeansPlusPlusInitialMeans; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMeansPlusPlusInitialMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -48,12 +48,11 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; @@ -78,7 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @param <V> Vector type */ -public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends AbstractKMeans<V, DoubleDistance, MeanModel<V>> { +public class SameSizeKMeansAlgorithm<V extends NumberVector> extends AbstractKMeans<V, MeanModel> { /** * Class logger */ @@ -92,7 +91,7 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract * @param maxiter Maximum number of iterations * @param initializer */ - public SameSizeKMeansAlgorithm(PrimitiveDoubleDistanceFunction<? super NumberVector<?>> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) { + public SameSizeKMeansAlgorithm(PrimitiveDistanceFunction<? super NumberVector> distanceFunction, int k, int maxiter, KMeansInitialization<? super V> initializer) { super(distanceFunction, k, maxiter, initializer); } @@ -104,11 +103,11 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract * @return result */ @Override - public Clustering<MeanModel<V>> run(Database database, Relation<V> relation) { + public Clustering<MeanModel> run(Database database, Relation<V> relation) { // Database objects to process final DBIDs ids = relation.getDBIDs(); // Choose initial means - List<? extends NumberVector<?>> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction()); + List<Vector> means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction(), Vector.FACTORY); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for(int i = 0; i < k; i++) { @@ -125,11 +124,10 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract means = refineResult(relation, means, clusters, metas, tids); // Wrap result - Clustering<MeanModel<V>> result = new Clustering<>("k-Means Samesize Clustering", "kmeans-samesize-clustering"); - final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation); + Clustering<MeanModel> result = new Clustering<>("k-Means Samesize Clustering", "kmeans-samesize-clustering"); for(int i = 0; i < clusters.size(); i++) { - V mean = factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()); - result.addToplevelCluster(new Cluster<>(clusters.get(i), new MeanModel<>(mean))); + Vector mean = means.get(i).getColumnVector(); + result.addToplevelCluster(new Cluster<>(clusters.get(i), new MeanModel(mean))); } return result; } @@ -141,10 +139,8 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract * @param means Mean vectors * @return Initialized storage */ - protected WritableDataStore<Meta> initializeMeta(Relation<V> relation, List<? extends NumberVector<?>> means) { - // This is a safe cast - see constructor. - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<NumberVector<?>>) getDistanceFunction(); + protected WritableDataStore<Meta> initializeMeta(Relation<V> relation, List<? extends NumberVector> means) { + PrimitiveDistanceFunction<? super NumberVector> df = getDistanceFunction(); // The actual storage final WritableDataStore<Meta> metas = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Meta.class); // Build the metadata, track the two nearest cluster centers. @@ -152,7 +148,7 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract Meta c = new Meta(k); V fv = relation.get(id); for(int i = 0; i < k; i++) { - c.dists[i] = df.doubleDistance(fv, means.get(i)); + c.dists[i] = df.distance(fv, means.get(i)); if(i > 0) { if(c.dists[i] < c.dists[c.primary]) { c.primary = i; @@ -231,14 +227,14 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract * @param metas Metadata storage * @param df Distance function */ - protected void updateDistances(Relation<V> relation, List<? extends NumberVector<?>> means, final WritableDataStore<Meta> metas, PrimitiveDoubleDistanceFunction<NumberVector<?>> df) { + protected void updateDistances(Relation<V> relation, List<Vector> means, final WritableDataStore<Meta> metas, PrimitiveDistanceFunction<? super NumberVector> df) { for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) { Meta c = metas.get(id); V fv = relation.get(id); // Update distances to means. c.secondary = -1; for(int i = 0; i < k; i++) { - c.dists[i] = df.doubleDistance(fv, means.get(i)); + c.dists[i] = df.distance(fv, means.get(i)); if(c.primary != i) { if(c.secondary < 0 || c.dists[i] < c.dists[c.secondary]) { c.secondary = i; @@ -259,10 +255,9 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract * @param tids DBIDs array * @return final means */ - protected List<? extends NumberVector<?>> refineResult(Relation<V> relation, List<? extends NumberVector<?>> means, List<ModifiableDBIDs> clusters, final WritableDataStore<Meta> metas, ArrayModifiableDBIDs tids) { + protected List<Vector> refineResult(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, final WritableDataStore<Meta> metas, ArrayModifiableDBIDs tids) { // This is a safe cast - see constructor. - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<NumberVector<?>>) getDistanceFunction(); + PrimitiveDistanceFunction<? super NumberVector> df = getDistanceFunction(); // Our desired cluster size: final int minsize = tids.size() / k; // rounded down final int maxsize = (tids.size() + k - 1) / k; // rounded up @@ -289,7 +284,7 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract transfers[i] = DBIDUtil.newArray(); } - for(int iter = 0; maxiter < 0 || iter < maxiter; iter++) { + for(int iter = 0; maxiter <= 0 || iter < maxiter; iter++) { updateDistances(relation, means, metas, df); tids.sort(comp); int active = 0; // Track if anything has changed @@ -461,7 +456,7 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { /** * k Parameter. */ @@ -480,12 +475,12 @@ public class SameSizeKMeansAlgorithm<V extends NumberVector<?>> extends Abstract /** * Distance function */ - protected PrimitiveDoubleDistanceFunction<? super NumberVector<?>> distanceFunction; + protected PrimitiveDistanceFunction<? super NumberVector> distanceFunction; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<PrimitiveDoubleDistanceFunction<? super NumberVector<?>>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDoubleDistanceFunction.class); + ObjectParameter<PrimitiveDistanceFunction<? super NumberVector>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class); if(config.grab(distanceFunctionP)) { distanceFunction = distanceFunctionP.instantiateClass(config); if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) { diff --git a/src/tutorial/clustering/package-info.java b/src/tutorial/clustering/package-info.java index ac7ab950..e2971ef8 100644 --- a/src/tutorial/clustering/package-info.java +++ b/src/tutorial/clustering/package-info.java @@ -5,7 +5,7 @@ This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team |