diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/KNNDistancesSampler.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/KNNDistancesSampler.java | 268 |
1 files changed, 268 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistancesSampler.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistancesSampler.java new file mode 100644 index 00000000..42cfef2c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistancesSampler.java @@ -0,0 +1,268 @@ +package de.lmu.ifi.dbs.elki.algorithm; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Arrays; +import java.util.Iterator; + +import de.lmu.ifi.dbs.elki.algorithm.KNNDistancesSampler.KNNDistanceOrderResult; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.KNNList; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.result.BasicResult; +import de.lmu.ifi.dbs.elki.result.IterableResult; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * Provides an order of the kNN-distances for all objects within the database. + * + * This class can be used to estimate parameters for other algorithms, such as + * estimating the epsilon parameter for DBSCAN: set k to minPts-1, and then + * choose a percentile from the sample as epsilon, or plot the result as a graph + * and look for a bend or knee in this plot. + * + * @author Arthur Zimek + * + * @param <O> the type of objects handled by this Algorithm + */ +@Title("KNN-Distance-Order") +@Description("Assesses the knn distances for a specified k and orders them.") +public class KNNDistancesSampler<O> extends AbstractDistanceBasedAlgorithm<O, KNNDistanceOrderResult> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(KNNDistancesSampler.class); + + /** + * Parameter k. + */ + protected int k; + + /** + * Sampling percentage. + */ + protected double sample; + + /** + * Random number seeding. + */ + private RandomFactory rnd; + + /** + * Constructor. + * + * @param distanceFunction Distance function + * @param k k Parameter + * @param sample Sampling rate, or sample size (when > 1) + * @param rnd Random source. + */ + public KNNDistancesSampler(DistanceFunction<O> distanceFunction, int k, double sample, RandomFactory rnd) { + super(distanceFunction); + this.k = k; + this.sample = sample; + this.rnd = rnd; + } + + /** + * Provides an order of the kNN-distances for all objects within the specified + * database. + * + * @param database Database + * @param relation Relation + * @return Result + */ + public KNNDistanceOrderResult run(Database database, Relation<O> relation) { + final DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction()); + final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, k + 1); + + final int size = (int) ((sample <= 1.) ? Math.ceil(relation.size() * sample) : sample); + DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), size, rnd); + + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling kNN distances.", size, LOG) : null; + double[] knnDistances = new double[size]; + int i = 0; + for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance(), i++) { + final KNNList neighbors = knnQuery.getKNNForDBID(iditer, k + 1); + knnDistances[i] = neighbors.getKNNDistance(); + LOG.incrementProcessed(prog); + } + Arrays.sort(knnDistances); + LOG.ensureCompleted(prog); + return new KNNDistanceOrderResult("kNN distances sample", "knn-distances", knnDistances); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Wraps a list containing the knn distances. + * + * @author Arthur Zimek + * + * @apiviz.exclude + */ + public static class KNNDistanceOrderResult extends BasicResult implements IterableResult<Double> { + /** + * Store the kNN Distances + */ + private final double[] knnDistances; + + /** + * Construct result + * + * @param name The long name (for pretty printing) + * @param shortname the short name (for filenames etc.) + * @param knnDistances distance list to wrap. + */ + public KNNDistanceOrderResult(String name, String shortname, final double[] knnDistances) { + super(name, shortname); + this.knnDistances = knnDistances; + } + + /** + * Return an iterator. + */ + @Override + public Iterator<Double> iterator() { + return new Iterator<Double>() { + int pos = 0; + + @Override + public boolean hasNext() { + return (pos < knnDistances.length); + } + + @Override + public Double next() { + return knnDistances[pos++]; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { + /** + * Parameter to specify the distance of the k-distant object to be assessed, + * must be an integer greater than 0. + */ + public static final OptionID K_ID = new OptionID("knndistanceorder.k", "Specifies the distance of the k-distant object to be assessed, ignoring the query object."); + + /** + * Parameter to specify the average percentage of distances randomly chosen + * to be provided in the result, must be a double greater than 0 and less + * than or equal to 1. + */ + public static final OptionID SAMPLING_ID = new OptionID("knndistanceorder.sample", "The percentage of objects to use for sampling, or the absolute number of samples."); + + /** + * Random generator seed for distances. + */ + public static final OptionID SEED_ID = new OptionID("knndistanceorder.seed", "Random generator seed for sampling."); + + /** + * Parameter k. + */ + protected int k; + + /** + * Sampling percentage. + */ + protected double percentage; + + /** + * Random number seeding. + */ + private RandomFactory rnd; + + /** + * Constructor. + */ + public Parameterizer() { + super(); + } + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + IntParameter kP = new IntParameter(K_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(kP)) { + k = kP.getValue(); + } + + DoubleParameter percentageP = new DoubleParameter(SAMPLING_ID, 1.) // + .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + if(config.grab(percentageP)) { + percentage = percentageP.getValue(); + } + + RandomParameter randomP = new RandomParameter(SEED_ID); + if(config.grab(randomP)) { + rnd = randomP.getValue(); + } + } + + @Override + protected KNNDistancesSampler<O> makeInstance() { + return new KNNDistancesSampler<>(distanceFunction, k, percentage, rnd); + } + } +}
\ No newline at end of file |