diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/statistics')
5 files changed, 159 insertions, 55 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java new file mode 100644 index 00000000..481261b3 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java @@ -0,0 +1,97 @@ +package de.lmu.ifi.dbs.elki.algorithm.statistics; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2012 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.algorithm.Algorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.scales.LinearScale; +import de.lmu.ifi.dbs.elki.result.Result; +import de.lmu.ifi.dbs.elki.result.ResultUtil; +import de.lmu.ifi.dbs.elki.result.ScalesResult; +import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; + +/** + * Pseudo "algorith" that computes the global min/max for a relation across all + * attributes. + * + * @author Erich Schubert + */ +@Description("Setup a scaling so that all dimensions are scaled equally in visualization.") +public class AddSingleScale implements Algorithm { + /** + * Constructor. + */ + public AddSingleScale() { + super(); + } + + @SuppressWarnings("unchecked") + @Override + public Result run(Database database) { + for(Relation<?> rel : database.getRelations()) { + if(TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(rel.getDataTypeInformation())) { + ScalesResult res = run((Relation<? extends NumberVector<?, ?>>) rel); + ResultUtil.addChildResult(rel, res); + } + } + return null; + } + + /** + * Add scales to a single vector relation. + * + * @param rel Relation + * @return Scales + */ + private ScalesResult run(Relation<? extends NumberVector<?, ?>> rel) { + final int dim = DatabaseUtil.dimensionality(rel); + DoubleMinMax minmax = new DoubleMinMax(); + for(DBIDIter iditer = rel.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id = iditer.getDBID(); + NumberVector<?, ?> vec = rel.get(id); + for(int d = 1; d <= dim; d++) { + minmax.put(vec.doubleValue(d)); + } + } + LinearScale scale = new LinearScale(minmax.getMin(), minmax.getMax()); + LinearScale[] scales = new LinearScale[dim]; + for(int i = 0; i < dim; i++) { + scales[i] = scale; + } + ScalesResult res = new ScalesResult(scales); + return res; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } +}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java index 1c74621b..f6f1d16f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java @@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; @@ -101,7 +102,7 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ? } @Override - public HistogramResult<DoubleVector> run(Database database) throws IllegalStateException { + public HistogramResult<DoubleVector> run(Database database) { final Relation<V> relation = database.getRelation(getInputTypeRestriction()[0]); final Relation<Object> lrelation = database.getRelation(getInputTypeRestriction()[1]); final DistanceQuery<V, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); @@ -122,7 +123,8 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ? } FiniteProgress objloop = logger.isVerbose() ? new FiniteProgress("Computing nearest neighbors", ids.size(), logger) : null; // sort neighbors - for(DBID id : ids) { + for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + DBID id = iter.getDBID(); KNNResult<D> knn = knnQuery.getKNNForDBID(id, k); Object label = lrelation.get(id); diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java index 78bbf5f4..d6ce6a15 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java @@ -31,7 +31,7 @@ import java.util.Random; import java.util.TreeSet; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.model.Model; @@ -40,6 +40,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; @@ -66,7 +67,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; -import de.lmu.ifi.dbs.elki.utilities.pairs.FCPair; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** @@ -131,11 +131,8 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex this.sampling = sampling; } - /** - * Iterates over all points in the database. - */ @Override - public HistogramResult<DoubleVector> run(Database database) throws IllegalStateException { + public HistogramResult<DoubleVector> run(Database database) { final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]); final DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); @@ -145,7 +142,7 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex DoubleMinMax gminmax = new DoubleMinMax(); // Cluster by labels - Collection<Cluster<Model>> split = (new ByLabelClustering()).run(database).getAllClusters(); + Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); // global in-cluster min/max DoubleMinMax giminmax = new DoubleMinMax(); @@ -184,12 +181,14 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex final Pair<Long, Long> incFirst = new Pair<Long, Long>(1L, 0L); final Pair<Long, Long> incSecond = new Pair<Long, Long>(0L, 1L); for(Cluster<?> c1 : split) { - for(DBID id1 : c1.getIDs()) { + for(DBIDIter iter = c1.getIDs().iter(); iter.valid(); iter.advance()) { + DBID id1 = iter.getDBID(); // in-cluster distances DoubleMinMax iminmax = new DoubleMinMax(); - for(DBID id2 : c1.getIDs()) { + for(DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); // skip the point itself. - if(id1 == id2) { + if(id1.sameDBID(id2)) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); @@ -212,9 +211,10 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex if(c2 == c1) { continue; } - for(DBID id2 : c2.getIDs()) { + for(DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); // skip the point itself (shouldn't happen though) - if(id1 == id2) { + if(id1.sameDBID(id2)) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); @@ -255,8 +255,6 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex onum += ppair.getSecond().getSecond(); } long bnum = inum + onum; - // Note: when full sampling is added, this assertion won't hold anymore. - assert (bnum == relation.size() * (relation.size() - 1)); Collection<DoubleVector> binstat = new ArrayList<DoubleVector>(numbin); for(DoubleObjPair<Pair<Long, Long>> ppair : histogram) { @@ -285,58 +283,62 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex Random rnd = new Random(); // estimate minimum and maximum. int k = (int) Math.max(25, Math.pow(database.size(), 0.2)); - TreeSet<FCPair<Double, DBID>> minhotset = new TreeSet<FCPair<Double, DBID>>(); - TreeSet<FCPair<Double, DBID>> maxhotset = new TreeSet<FCPair<Double, DBID>>(Collections.reverseOrder()); + TreeSet<DoubleObjPair<DBID>> minhotset = new TreeSet<DoubleObjPair<DBID>>(); + TreeSet<DoubleObjPair<DBID>> maxhotset = new TreeSet<DoubleObjPair<DBID>>(Collections.reverseOrder()); int randomsize = (int) Math.max(25, Math.pow(database.size(), 0.2)); double rprob = ((double) randomsize) / size; ArrayModifiableDBIDs randomset = DBIDUtil.newArray(randomsize); - Iterator<DBID> iter = database.iterDBIDs(); - if(!iter.hasNext()) { + DBIDIter iter = database.iterDBIDs(); + if(!iter.valid()) { throw new IllegalStateException(ExceptionMessages.DATABASE_EMPTY); } - DBID firstid = iter.next(); - minhotset.add(new FCPair<Double, DBID>(Double.MAX_VALUE, firstid)); - maxhotset.add(new FCPair<Double, DBID>(Double.MIN_VALUE, firstid)); - while(iter.hasNext()) { - DBID id1 = iter.next(); + DBID firstid = iter.getDBID(); + iter.advance(); + minhotset.add(new DoubleObjPair<DBID>(Double.MAX_VALUE, firstid)); + maxhotset.add(new DoubleObjPair<DBID>(Double.MIN_VALUE, firstid)); + while(iter.valid()) { + DBID id1 = iter.getDBID(); + iter.advance(); // generate candidates for min distance. - ArrayList<FCPair<Double, DBID>> np = new ArrayList<FCPair<Double, DBID>>(k * 2 + randomsize * 2); - for(FCPair<Double, DBID> pair : minhotset) { + ArrayList<DoubleObjPair<DBID>> np = new ArrayList<DoubleObjPair<DBID>>(k * 2 + randomsize * 2); + for(DoubleObjPair<DBID> pair : minhotset) { DBID id2 = pair.getSecond(); // skip the object itself if(id1.compareTo(id2) == 0) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new FCPair<Double, DBID>(d, id1)); - np.add(new FCPair<Double, DBID>(d, id2)); + np.add(new DoubleObjPair<DBID>(d, id1)); + np.add(new DoubleObjPair<DBID>(d, id2)); } - for(DBID id2 : randomset) { + for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new FCPair<Double, DBID>(d, id1)); - np.add(new FCPair<Double, DBID>(d, id2)); + np.add(new DoubleObjPair<DBID>(d, id1)); + np.add(new DoubleObjPair<DBID>(d, id2)); } minhotset.addAll(np); shrinkHeap(minhotset, k); // generate candidates for max distance. - ArrayList<FCPair<Double, DBID>> np2 = new ArrayList<FCPair<Double, DBID>>(k * 2 + randomsize * 2); - for(FCPair<Double, DBID> pair : minhotset) { + ArrayList<DoubleObjPair<DBID>> np2 = new ArrayList<DoubleObjPair<DBID>>(k * 2 + randomsize * 2); + for(DoubleObjPair<DBID> pair : minhotset) { DBID id2 = pair.getSecond(); // skip the object itself if(id1.compareTo(id2) == 0) { continue; } double d = distFunc.distance(id1, id2).doubleValue(); - np2.add(new FCPair<Double, DBID>(d, id1)); - np2.add(new FCPair<Double, DBID>(d, id2)); + np2.add(new DoubleObjPair<DBID>(d, id1)); + np2.add(new DoubleObjPair<DBID>(d, id2)); } - for(DBID id2 : randomset) { + for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + DBID id2 = iter2.getDBID(); double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new FCPair<Double, DBID>(d, id1)); - np.add(new FCPair<Double, DBID>(d, id2)); + np.add(new DoubleObjPair<DBID>(d, id1)); + np.add(new DoubleObjPair<DBID>(d, id2)); } maxhotset.addAll(np2); shrinkHeap(maxhotset, k); @@ -349,14 +351,16 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), id1); } } - return new DoubleMinMax(minhotset.first().getFirst(), maxhotset.first().getFirst()); + return new DoubleMinMax(minhotset.first().first, maxhotset.first().first); } private DoubleMinMax exactMinMax(Relation<O> database, DistanceQuery<O, D> distFunc) { DoubleMinMax minmax = new DoubleMinMax(); // find exact minimum and maximum first. - for(DBID id1 : database.iterDBIDs()) { - for(DBID id2 : database.iterDBIDs()) { + for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { + DBID id1 = iditer.getDBID(); + for(DBIDIter iditer2 = database.iterDBIDs(); iditer2.valid(); iditer2.advance()) { + DBID id2 = iditer2.getDBID(); // skip the point itself. if(id1.compareTo(id2) == 0) { continue; @@ -368,12 +372,12 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex return minmax; } - private void shrinkHeap(TreeSet<FCPair<Double, DBID>> hotset, int k) { + private void shrinkHeap(TreeSet<DoubleObjPair<DBID>> hotset, int k) { // drop duplicates ModifiableDBIDs seenids = DBIDUtil.newHashSet(2 * k); int cnt = 0; - for(Iterator<FCPair<Double, DBID>> i = hotset.iterator(); i.hasNext();) { - FCPair<Double, DBID> p = i.next(); + for(Iterator<DoubleObjPair<DBID>> i = hotset.iterator(); i.hasNext();) { + DoubleObjPair<DBID> p = i.next(); if(cnt > k || seenids.contains(p.getSecond())) { i.remove(); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java index c1eb118d..353c1b02 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java @@ -29,7 +29,7 @@ import java.util.Collections; import java.util.HashMap; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -39,6 +39,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; @@ -114,11 +115,8 @@ public class EvaluateRankingQuality<V extends NumberVector<V, ?>, D extends Numb */ int numbins = 20; - /** - * Run the algorithm. - */ @Override - public HistogramResult<DoubleVector> run(Database database) throws IllegalStateException { + public HistogramResult<DoubleVector> run(Database database) { final Relation<V> relation = database.getRelation(getInputTypeRestriction()[0]); final DistanceQuery<V, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); final KNNQuery<V, D> knnQuery = database.getKNNQuery(distQuery, relation.size()); @@ -127,7 +125,7 @@ public class EvaluateRankingQuality<V extends NumberVector<V, ?>, D extends Numb logger.verbose("Preprocessing clusters..."); } // Cluster by labels - Collection<Cluster<Model>> split = (new ByLabelClustering()).run(database).getAllClusters(); + Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); // Compute cluster averages and covariance matrix HashMap<Cluster<?>, V> averages = new HashMap<Cluster<?>, V>(split.size()); @@ -150,7 +148,8 @@ public class EvaluateRankingQuality<V extends NumberVector<V, ?>, D extends Numb Vector av = averages.get(clus).getColumnVector(); Matrix covm = covmats.get(clus); - for(DBID i1 : clus.getIDs()) { + for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { + DBID i1 = iter.getDBID(); Double d = MathUtil.mahalanobisDistance(covm, av.minus(relation.get(i1).getColumnVector())); cmem.add(new FCPair<Double, DBID>(d, i1)); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java index 6d64dc55..4305bbca 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java @@ -27,7 +27,7 @@ import java.util.ArrayList; import java.util.Collection; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering; +import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.data.model.Model; @@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNResult; @@ -107,7 +108,7 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends logger.verbose("Preprocessing clusters..."); } // Cluster by labels - Collection<Cluster<Model>> split = (new ByLabelClustering()).run(database).getAllClusters(); + Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters(); AggregatingHistogram<Double, Double> hist = AggregatingHistogram.DoubleSumHistogram(numbins, 0.0, 1.0); @@ -119,7 +120,8 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends MeanVariance mv = new MeanVariance(); // sort neighbors for(Cluster<?> clus : split) { - for(DBID i1 : clus.getIDs()) { + for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) { + DBID i1 = iter.getDBID(); KNNResult<D> knn = knnQuery.getKNNForDBID(i1, relation.size()); double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn); |