summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/index/idistance/InMemoryIDistanceIndex.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/index/idistance/InMemoryIDistanceIndex.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/index/idistance/InMemoryIDistanceIndex.java559
1 files changed, 559 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/index/idistance/InMemoryIDistanceIndex.java b/src/de/lmu/ifi/dbs/elki/index/idistance/InMemoryIDistanceIndex.java
new file mode 100644
index 00000000..926e77ad
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/index/idistance/InMemoryIDistanceIndex.java
@@ -0,0 +1,559 @@
+package de.lmu.ifi.dbs.elki.index.idistance;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.KNNHeap;
+import de.lmu.ifi.dbs.elki.database.ids.KNNList;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.index.AbstractRefiningIndex;
+import de.lmu.ifi.dbs.elki.index.IndexFactory;
+import de.lmu.ifi.dbs.elki.index.KNNIndex;
+import de.lmu.ifi.dbs.elki.index.RangeIndex;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
+import de.lmu.ifi.dbs.elki.math.MeanVarianceMinMax;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
+
+/**
+ * In-memory iDistance index, a metric indexing method using a reference point
+ * embedding.
+ *
+ * <b>Important note:</b> we are currently using a different query strategy. The
+ * original publication discusses queries based on repeated <em>radius</em>
+ * queries. We use a strategy based on shrinking spheres, iteratively refined
+ * starting with the closes reference point. We also do not use a B+-tree as
+ * data structure, but simple in-memory lists. Therefore, we cannot report page
+ * accesses needed.
+ *
+ * Feel free to contribute improved query strategies. All the code is
+ * essentially here, you only need to query every reference point list, not just
+ * the best.
+ *
+ * Reference:
+ * <p>
+ * C. Yu, B. C. Ooi, K. L. Tan, H. V. Jagadish<br />
+ * Indexing the Distance: An Efficient Method to KNN Processing.<br />
+ * In Proceedings of the 27th International Conference on Very Large Data Bases
+ * </p>
+ *
+ * <p>
+ * H. V. Jagadish, B. C. Ooi, K. L. Tan, C. Yu, R. Zhang<br />
+ * iDistance: An adaptive B+-tree based indexing method for nearest neighbor
+ * search.<br />
+ * ACM Transactions on Database Systems (TODS), 30(2), 364-397.
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type
+ */
+@Reference(authors = "C. Yu, B. C. Ooi, K. L. Tan, H. V. Jagadish", title = "Indexing the distance: An efficient method to knn processing", booktitle = "In Proceedings of the 27th International Conference on Very Large Data Bases", url = "http://www.vldb.org/conf/2001/P421.pdf")
+public class InMemoryIDistanceIndex<O> extends AbstractRefiningIndex<O> implements RangeIndex<O>, KNNIndex<O> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(InMemoryIDistanceIndex.class);
+
+ /**
+ * Distance query.
+ */
+ private DistanceQuery<O> distanceQuery;
+
+ /**
+ * Initialization method.
+ */
+ private KMedoidsInitialization<O> initialization;
+
+ /**
+ * Number of reference points.
+ */
+ private int numref;
+
+ /**
+ * Reference points.
+ */
+ private ArrayDBIDs referencepoints;
+
+ /**
+ * The actual index.
+ */
+ private ModifiableDoubleDBIDList[] index;
+
+ /**
+ * Second reference, for documentation generation.
+ */
+ @Reference(authors = "H. V. Jagadish, B. C. Ooi, K. L. Tan, C. Yu, R. Zhang", title = "iDistance: An adaptive B+-tree based indexing method for nearest neighbor search", booktitle = "ACM Transactions on Database Systems (TODS), 30(2), 364-397")
+ public static final Void SECOND_REFERENCE = null;
+
+ /**
+ * Constructor.
+ *
+ * @param relation Data relation
+ * @param distance Distance
+ * @param initialization Initialization method
+ * @param numref Number of reference points
+ */
+ public InMemoryIDistanceIndex(Relation<O> relation, DistanceQuery<O> distance, KMedoidsInitialization<O> initialization, int numref) {
+ super(relation);
+ this.distanceQuery = distance;
+ this.initialization = initialization;
+ this.numref = numref;
+ if(!distance.getDistanceFunction().isMetric()) {
+ LOG.warning("iDistance assumes metric distance functions.\n" //
+ + distance.getDistanceFunction().getClass() + " does not report itself as metric.\n" //
+ + "iDistance will run, but may yield approximate results.");
+ }
+ }
+
+ @Override
+ public void initialize() {
+ referencepoints = DBIDUtil.ensureArray(initialization.chooseInitialMedoids(numref, relation.getDBIDs(), distanceQuery));
+ final int k = referencepoints.size(); // should be the same k anyway.
+ index = new ModifiableDoubleDBIDList[k];
+ for(int i = 0; i < k; i++) {
+ index[i] = DBIDUtil.newDistanceDBIDList(relation.size() / (2 * k));
+ }
+ // TODO: add optimized codepath for primitive distances.
+ DBIDArrayIter riter = referencepoints.iter();
+ for(DBIDIter oiter = relation.iterDBIDs(); oiter.valid(); oiter.advance()) {
+ double bestd = Double.POSITIVE_INFINITY;
+ int besti = -1;
+ for(riter.seek(0); riter.valid(); riter.advance()) {
+ double dist = distanceQuery.distance(oiter, riter);
+ if(dist < bestd) {
+ bestd = dist;
+ besti = riter.getOffset();
+ }
+ }
+ assert (besti >= 0 && besti < k);
+ index[besti].add(bestd, oiter);
+ }
+
+ // Sort index.
+ for(int i = 0; i < k; i++) {
+ index[i].sort();
+ }
+ }
+
+ @Override
+ public KNNQuery<O> getKNNQuery(DistanceQuery<O> distanceQuery, Object... hints) {
+ // Query on the relation we index
+ if(distanceQuery.getRelation() != relation) {
+ return null;
+ }
+ DistanceFunction<? super O> distanceFunction = (DistanceFunction<? super O>) distanceQuery.getDistanceFunction();
+ if(!this.getDistanceFunction().equals(distanceFunction)) {
+ if(LOG.isDebugging()) {
+ LOG.debug("Distance function not supported by index - or 'equals' not implemented right!");
+ }
+ return null;
+ }
+ return new IDistanceKNNQuery(distanceQuery);
+ }
+
+ @Override
+ public RangeQuery<O> getRangeQuery(DistanceQuery<O> distanceQuery, Object... hints) {
+ // Query on the relation we index
+ if(distanceQuery.getRelation() != relation) {
+ return null;
+ }
+ DistanceFunction<? super O> distanceFunction = (DistanceFunction<? super O>) distanceQuery.getDistanceFunction();
+ if(!this.getDistanceFunction().equals(distanceFunction)) {
+ if(LOG.isDebugging()) {
+ LOG.debug("Distance function not supported by index - or 'equals' not implemented right!");
+ }
+ return null;
+ }
+ return new IDistanceRangeQuery(distanceQuery);
+ }
+
+ /**
+ * Distance function.
+ *
+ * @return Distance function
+ */
+ private DistanceFunction<? super O> getDistanceFunction() {
+ return distanceQuery.getDistanceFunction();
+ }
+
+ @Override
+ public String getLongName() {
+ return "iDistance index";
+ }
+
+ @Override
+ public String getShortName() {
+ return "idistance-index";
+ }
+
+ @Override
+ public Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ public void logStatistics() {
+ super.logStatistics();
+ MeanVarianceMinMax mm = new MeanVarianceMinMax();
+ for(int i = 0; i < index.length; i++) {
+ mm.put(index[i].size());
+ }
+ LOG.statistics(new LongStatistic(InMemoryIDistanceIndex.class.getName() + ".size.min", (int) mm.getMin()));
+ LOG.statistics(new DoubleStatistic(InMemoryIDistanceIndex.class.getName() + ".size.mean", mm.getMean()));
+ LOG.statistics(new LongStatistic(InMemoryIDistanceIndex.class.getName() + ".size.max", (int) mm.getMax()));
+ }
+
+ /**
+ * Sort the reference points by distance to the query object
+ *
+ * @param distanceQuery Distance query
+ * @param obj Query object
+ * @param referencepoints Iterator for reference points
+ * @return Sorted array.
+ */
+ protected static <O> DoubleIntPair[] rankReferencePoints(DistanceQuery<O> distanceQuery, O obj, ArrayDBIDs referencepoints) {
+ DoubleIntPair[] priority = new DoubleIntPair[referencepoints.size()];
+ // Compute distances to reference points.
+ for(DBIDArrayIter iter = referencepoints.iter(); iter.valid(); iter.advance()) {
+ final int i = iter.getOffset();
+ final double dist = distanceQuery.distance(obj, iter);
+ priority[i] = new DoubleIntPair(dist, i);
+ }
+ Arrays.sort(priority);
+ return priority;
+ }
+
+ /**
+ * Seek an iterator to the desired position, using binary search.
+ *
+ * @param index Index to search
+ * @param iter Iterator
+ * @param val Distance to search to
+ */
+ protected static void binarySearch(ModifiableDoubleDBIDList index, DoubleDBIDListIter iter, double val) {
+ // Binary search. TODO: move this into the DoubleDBIDList class.
+ int left = 0, right = index.size();
+ while(left < right) {
+ final int mid = (left + right) >>> 1;
+ final double curd = iter.seek(mid).doubleValue();
+ if(val < curd) {
+ right = mid;
+ }
+ else if(val > curd) {
+ left = mid + 1;
+ }
+ else {
+ left = mid;
+ break;
+ }
+ }
+ if(left >= index.size()) {
+ --left;
+ }
+ iter.seek(left);
+ }
+
+ /**
+ * kNN query implementation.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ protected class IDistanceKNNQuery extends AbstractRefiningIndex<O>.AbstractKNNQuery {
+ /**
+ * Constructor.
+ *
+ * @param distanceQuery Distance query
+ */
+ public IDistanceKNNQuery(DistanceQuery<O> distanceQuery) {
+ super(distanceQuery);
+ }
+
+ @Override
+ public KNNList getKNNForObject(O obj, int k) {
+ DoubleIntPair[] priority = rankReferencePoints(distanceQuery, obj, referencepoints);
+ // Approximate kNN search. We do not check _every_ list.
+ KNNHeap heap = DBIDUtil.newHeap(k);
+
+ for(DoubleIntPair pair : priority) {
+ final ModifiableDoubleDBIDList nindex = index[pair.second];
+ final double refd = pair.first;
+
+ final DoubleDBIDListIter ifwd = nindex.iter(), ibwd = nindex.iter();
+ binarySearch(nindex, ibwd, refd);
+ ifwd.seek(ibwd.getOffset() + 1);
+
+ // This assumes a metric, as we exploit triangle inequality:
+ // Lower bound for candidates further from the reference object:
+ // d(candidate, reference) <= d(candidate, query) + d(query, reference)
+ // d(candidate, reference) - d(query, reference) <= d(candidate, query)
+ double lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
+ // Lower bound for candidates closer to the reference object:
+ // d(query, reference) <= d(query, candidate) + d(candidate, reference)
+ // d(query, reference) - d(candidate, reference) <= d(query, candidate)
+ double lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
+ // Current query radius.
+ double kdist = heap.getKNNDistance();
+ while(true) {
+ // Handle NaN carefully.
+ if(!(lbfwd <= kdist) && !(lbbwd <= kdist)) {
+ break;
+ }
+ // Careful: NaN handling: not NaN and not worse than fwd (may be NaN).
+ if(lbfwd <= kdist && !(lbfwd > lbbwd)) {
+ final double dist = refine(ifwd, obj);
+ if(dist <= kdist) {
+ heap.insert(dist, ifwd);
+ kdist = heap.getKNNDistance();
+ }
+ // Advance iterator:
+ ifwd.advance();
+ lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
+ }
+ if(lbbwd <= kdist && !(lbbwd > lbfwd)) {
+ final double dist = refine(ibwd, obj);
+ if(dist <= kdist) {
+ heap.insert(dist, ibwd);
+ kdist = heap.getKNNDistance();
+ }
+ // Retract iterator:
+ ibwd.retract();
+ lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
+ }
+ }
+ }
+
+ return heap.toKNNList();
+ }
+ }
+
+ /**
+ * Exact Range query implementation.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ protected class IDistanceRangeQuery extends AbstractRefiningIndex<O>.AbstractRangeQuery {
+ /**
+ * Constructor.
+ *
+ * @param distanceQuery Distance query
+ */
+ public IDistanceRangeQuery(DistanceQuery<O> distanceQuery) {
+ super(distanceQuery);
+ }
+
+ @Override
+ public DoubleDBIDList getRangeForObject(O obj, double range) {
+ DoubleIntPair[] priority = rankReferencePoints(distanceQuery, obj, referencepoints);
+ ModifiableDoubleDBIDList result = DBIDUtil.newDistanceDBIDList();
+ for(DoubleIntPair pair : priority) {
+ final ModifiableDoubleDBIDList nindex = index[pair.second];
+ final double refd = pair.first;
+
+ DoubleDBIDListIter ifwd = nindex.iter(), ibwd = nindex.iter();
+ binarySearch(nindex, ibwd, refd);
+ ifwd.seek(ibwd.getOffset() + 1);
+
+ // This assumes a metric, as we exploit triangle inequality:
+ // Lower bound for candidates further from the reference object:
+ // d(candidate, reference) <= d(candidate, query) + d(query, reference)
+ // d(candidate, reference) - d(query, reference) <= d(candidate, query)
+ double lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
+ // Lower bound for candidates closer to the reference object:
+ // d(query, reference) <= d(query, candidate) + d(candidate, reference)
+ // d(query, reference) - d(candidate, reference) <= d(query, candidate)
+ double lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
+ while(true) {
+ // Handle NaN carefully.
+ if(!(lbfwd <= range) && !(lbbwd <= range)) {
+ break;
+ }
+ // Careful: NaN handling: not NaN and not worse than fwd (may be NaN).
+ if(lbfwd <= range && !(lbfwd > lbbwd)) {
+ final double dist = refine(ifwd, obj);
+ if(dist <= range) {
+ result.add(dist, ifwd);
+ }
+ // Advance iterator:
+ ifwd.advance();
+ lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
+ }
+ if(lbbwd <= range && !(lbbwd > lbfwd)) {
+ final double dist = refine(ibwd, obj);
+ if(dist <= range) {
+ result.add(dist, ibwd);
+ }
+ // Retract iterator:
+ ibwd.retract();
+ lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
+ }
+ }
+ }
+ result.sort();
+ return result;
+ }
+ }
+
+ /**
+ * Index factory for iDistance indexes.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has InMemoryIDistanceIndex
+ *
+ * @param <V> Data type.
+ */
+ public static class Factory<V> implements IndexFactory<V, InMemoryIDistanceIndex<V>> {
+ /**
+ * Distance function to use.
+ */
+ DistanceFunction<? super V> distance;
+
+ /**
+ * Initialization method.
+ */
+ KMedoidsInitialization<V> initialization;
+
+ /**
+ * Number of reference points
+ */
+ int k;
+
+ /**
+ * Constructor.
+ *
+ * @param distance Distance function
+ * @param initialization Initialization method
+ * @param k Number of reference points
+ */
+ public Factory(DistanceFunction<? super V> distance, KMedoidsInitialization<V> initialization, int k) {
+ super();
+ this.distance = distance;
+ this.initialization = initialization;
+ this.k = k;
+ }
+
+ @Override
+ public InMemoryIDistanceIndex<V> instantiate(Relation<V> relation) {
+ return new InMemoryIDistanceIndex<>(relation, distance.instantiate(relation), initialization, k);
+ }
+
+ @Override
+ public TypeInformation getInputTypeRestriction() {
+ return distance.getInputTypeRestriction();
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> object type.
+ */
+ public static class Parameterizer<V> extends AbstractParameterizer {
+ /**
+ * Parameter for the distance function
+ */
+ public static final OptionID DISTANCE_ID = new OptionID("idistance.distance", "Distance function to build the index for.");
+
+ /**
+ * Initialization method.
+ */
+ public static final OptionID REFERENCE_ID = new OptionID("idistance.reference", "Method to choose the reference points.");
+
+ /**
+ * Number of reference points.
+ */
+ public static final OptionID K_ID = new OptionID("idistance.k", "Number of reference points to use.");
+
+ /**
+ * Distance function to use.
+ */
+ DistanceFunction<? super V> distance;
+
+ /**
+ * Initialization method.
+ */
+ KMedoidsInitialization<V> initialization;
+
+ /**
+ * Number of reference points
+ */
+ int k;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<DistanceFunction<? super V>> distanceP = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class);
+ if(config.grab(distanceP)) {
+ distance = distanceP.instantiateClass(config);
+ }
+
+ ObjectParameter<KMedoidsInitialization<V>> initializationP = new ObjectParameter<>(REFERENCE_ID, KMedoidsInitialization.class);
+ if(config.grab(initializationP)) {
+ initialization = initializationP.instantiateClass(config);
+ }
+
+ IntParameter kP = new IntParameter(K_ID)//
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.intValue();
+ }
+ }
+
+ @Override
+ protected InMemoryIDistanceIndex.Factory<V> makeInstance() {
+ return new InMemoryIDistanceIndex.Factory<>(distance, initialization, k);
+ }
+ }
+ }
+}