package de.lmu.ifi.dbs.elki.index.idistance;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.Arrays;
import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
import de.lmu.ifi.dbs.elki.database.ids.KNNHeap;
import de.lmu.ifi.dbs.elki.database.ids.KNNList;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.index.AbstractRefiningIndex;
import de.lmu.ifi.dbs.elki.index.IndexFactory;
import de.lmu.ifi.dbs.elki.index.KNNIndex;
import de.lmu.ifi.dbs.elki.index.RangeIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.math.MeanVarianceMinMax;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
/**
* In-memory iDistance index, a metric indexing method using a reference point
* embedding.
*
* Important note: we are currently using a different query strategy. The
* original publication discusses queries based on repeated radius
* queries. We use a strategy based on shrinking spheres, iteratively refined
* starting with the closes reference point. We also do not use a B+-tree as
* data structure, but simple in-memory lists. Therefore, we cannot report page
* accesses needed.
*
* Feel free to contribute improved query strategies. All the code is
* essentially here, you only need to query every reference point list, not just
* the best.
*
* Reference:
*
* C. Yu, B. C. Ooi, K. L. Tan, H. V. Jagadish
* Indexing the Distance: An Efficient Method to KNN Processing.
* In Proceedings of the 27th International Conference on Very Large Data Bases
*
*
*
* H. V. Jagadish, B. C. Ooi, K. L. Tan, C. Yu, R. Zhang
* iDistance: An adaptive B+-tree based indexing method for nearest neighbor
* search.
* ACM Transactions on Database Systems (TODS), 30(2), 364-397.
*
*
* @author Erich Schubert
* @since 0.7.0
*
* @param Object type
*/
@Reference(authors = "C. Yu, B. C. Ooi, K. L. Tan, H. V. Jagadish", title = "Indexing the distance: An efficient method to knn processing", booktitle = "In Proceedings of the 27th International Conference on Very Large Data Bases", url = "http://www.vldb.org/conf/2001/P421.pdf")
public class InMemoryIDistanceIndex extends AbstractRefiningIndex implements RangeIndex, KNNIndex {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(InMemoryIDistanceIndex.class);
/**
* Distance query.
*/
private DistanceQuery distanceQuery;
/**
* Initialization method.
*/
private KMedoidsInitialization initialization;
/**
* Number of reference points.
*/
private int numref;
/**
* Reference points.
*/
private ArrayDBIDs referencepoints;
/**
* The actual index.
*/
private ModifiableDoubleDBIDList[] index;
/**
* Second reference, for documentation generation.
*/
@Reference(authors = "H. V. Jagadish, B. C. Ooi, K. L. Tan, C. Yu, R. Zhang", title = "iDistance: An adaptive B+-tree based indexing method for nearest neighbor search", booktitle = "ACM Transactions on Database Systems (TODS), 30(2), 364-397")
public static final Void SECOND_REFERENCE = null;
/**
* Constructor.
*
* @param relation Data relation
* @param distance Distance
* @param initialization Initialization method
* @param numref Number of reference points
*/
public InMemoryIDistanceIndex(Relation relation, DistanceQuery distance, KMedoidsInitialization initialization, int numref) {
super(relation);
this.distanceQuery = distance;
this.initialization = initialization;
this.numref = numref;
if(!distance.getDistanceFunction().isMetric()) {
LOG.warning("iDistance assumes metric distance functions.\n" //
+ distance.getDistanceFunction().getClass() + " does not report itself as metric.\n" //
+ "iDistance will run, but may yield approximate results.");
}
}
@Override
public void initialize() {
referencepoints = DBIDUtil.ensureArray(initialization.chooseInitialMedoids(numref, relation.getDBIDs(), distanceQuery));
final int k = referencepoints.size(); // should be the same k anyway.
index = new ModifiableDoubleDBIDList[k];
for(int i = 0; i < k; i++) {
index[i] = DBIDUtil.newDistanceDBIDList(relation.size() / (2 * k));
}
// TODO: add optimized codepath for primitive distances.
DBIDArrayIter riter = referencepoints.iter();
for(DBIDIter oiter = relation.iterDBIDs(); oiter.valid(); oiter.advance()) {
double bestd = Double.POSITIVE_INFINITY;
int besti = -1;
for(riter.seek(0); riter.valid(); riter.advance()) {
double dist = distanceQuery.distance(oiter, riter);
if(dist < bestd) {
bestd = dist;
besti = riter.getOffset();
}
}
assert (besti >= 0 && besti < k);
index[besti].add(bestd, oiter);
}
// Sort index.
for(int i = 0; i < k; i++) {
index[i].sort();
}
}
@Override
public KNNQuery getKNNQuery(DistanceQuery distanceQuery, Object... hints) {
// Query on the relation we index
if(distanceQuery.getRelation() != relation) {
return null;
}
DistanceFunction super O> distanceFunction = (DistanceFunction super O>) distanceQuery.getDistanceFunction();
if(!this.getDistanceFunction().equals(distanceFunction)) {
if(LOG.isDebugging()) {
LOG.debug("Distance function not supported by index - or 'equals' not implemented right!");
}
return null;
}
return new IDistanceKNNQuery(distanceQuery);
}
@Override
public RangeQuery getRangeQuery(DistanceQuery distanceQuery, Object... hints) {
// Query on the relation we index
if(distanceQuery.getRelation() != relation) {
return null;
}
DistanceFunction super O> distanceFunction = (DistanceFunction super O>) distanceQuery.getDistanceFunction();
if(!this.getDistanceFunction().equals(distanceFunction)) {
if(LOG.isDebugging()) {
LOG.debug("Distance function not supported by index - or 'equals' not implemented right!");
}
return null;
}
return new IDistanceRangeQuery(distanceQuery);
}
/**
* Distance function.
*
* @return Distance function
*/
private DistanceFunction super O> getDistanceFunction() {
return distanceQuery.getDistanceFunction();
}
@Override
public String getLongName() {
return "iDistance index";
}
@Override
public String getShortName() {
return "idistance-index";
}
@Override
public Logging getLogger() {
return LOG;
}
@Override
public void logStatistics() {
super.logStatistics();
MeanVarianceMinMax mm = new MeanVarianceMinMax();
for(int i = 0; i < index.length; i++) {
mm.put(index[i].size());
}
LOG.statistics(new LongStatistic(InMemoryIDistanceIndex.class.getName() + ".size.min", (int) mm.getMin()));
LOG.statistics(new DoubleStatistic(InMemoryIDistanceIndex.class.getName() + ".size.mean", mm.getMean()));
LOG.statistics(new LongStatistic(InMemoryIDistanceIndex.class.getName() + ".size.max", (int) mm.getMax()));
}
/**
* Sort the reference points by distance to the query object
*
* @param distanceQuery Distance query
* @param obj Query object
* @param referencepoints Iterator for reference points
* @return Sorted array.
*/
protected static DoubleIntPair[] rankReferencePoints(DistanceQuery distanceQuery, O obj, ArrayDBIDs referencepoints) {
DoubleIntPair[] priority = new DoubleIntPair[referencepoints.size()];
// Compute distances to reference points.
for(DBIDArrayIter iter = referencepoints.iter(); iter.valid(); iter.advance()) {
final int i = iter.getOffset();
final double dist = distanceQuery.distance(obj, iter);
priority[i] = new DoubleIntPair(dist, i);
}
Arrays.sort(priority);
return priority;
}
/**
* Seek an iterator to the desired position, using binary search.
*
* @param index Index to search
* @param iter Iterator
* @param val Distance to search to
*/
protected static void binarySearch(ModifiableDoubleDBIDList index, DoubleDBIDListIter iter, double val) {
// Binary search. TODO: move this into the DoubleDBIDList class.
int left = 0, right = index.size();
while(left < right) {
final int mid = (left + right) >>> 1;
final double curd = iter.seek(mid).doubleValue();
if(val < curd) {
right = mid;
}
else if(val > curd) {
left = mid + 1;
}
else {
left = mid;
break;
}
}
if(left >= index.size()) {
--left;
}
iter.seek(left);
}
/**
* kNN query implementation.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected class IDistanceKNNQuery extends AbstractRefiningIndex.AbstractKNNQuery {
/**
* Constructor.
*
* @param distanceQuery Distance query
*/
public IDistanceKNNQuery(DistanceQuery distanceQuery) {
super(distanceQuery);
}
@Override
public KNNList getKNNForObject(O obj, int k) {
DoubleIntPair[] priority = rankReferencePoints(distanceQuery, obj, referencepoints);
// Approximate kNN search. We do not check _every_ list.
KNNHeap heap = DBIDUtil.newHeap(k);
for(DoubleIntPair pair : priority) {
final ModifiableDoubleDBIDList nindex = index[pair.second];
final double refd = pair.first;
final DoubleDBIDListIter ifwd = nindex.iter(), ibwd = nindex.iter();
binarySearch(nindex, ibwd, refd);
ifwd.seek(ibwd.getOffset() + 1);
// This assumes a metric, as we exploit triangle inequality:
// Lower bound for candidates further from the reference object:
// d(candidate, reference) <= d(candidate, query) + d(query, reference)
// d(candidate, reference) - d(query, reference) <= d(candidate, query)
double lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
// Lower bound for candidates closer to the reference object:
// d(query, reference) <= d(query, candidate) + d(candidate, reference)
// d(query, reference) - d(candidate, reference) <= d(query, candidate)
double lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
// Current query radius.
double kdist = heap.getKNNDistance();
while(true) {
// Handle NaN carefully.
if(!(lbfwd <= kdist) && !(lbbwd <= kdist)) {
break;
}
// Careful: NaN handling: not NaN and not worse than fwd (may be NaN).
if(lbfwd <= kdist && !(lbfwd > lbbwd)) {
final double dist = refine(ifwd, obj);
if(dist <= kdist) {
heap.insert(dist, ifwd);
kdist = heap.getKNNDistance();
}
// Advance iterator:
ifwd.advance();
lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
}
if(lbbwd <= kdist && !(lbbwd > lbfwd)) {
final double dist = refine(ibwd, obj);
if(dist <= kdist) {
heap.insert(dist, ibwd);
kdist = heap.getKNNDistance();
}
// Retract iterator:
ibwd.retract();
lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
}
}
}
return heap.toKNNList();
}
}
/**
* Exact Range query implementation.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected class IDistanceRangeQuery extends AbstractRefiningIndex.AbstractRangeQuery {
/**
* Constructor.
*
* @param distanceQuery Distance query
*/
public IDistanceRangeQuery(DistanceQuery distanceQuery) {
super(distanceQuery);
}
@Override
public void getRangeForObject(O obj, double range, ModifiableDoubleDBIDList result) {
DoubleIntPair[] priority = rankReferencePoints(distanceQuery, obj, referencepoints);
for(DoubleIntPair pair : priority) {
final ModifiableDoubleDBIDList nindex = index[pair.second];
final double refd = pair.first;
DoubleDBIDListIter ifwd = nindex.iter(), ibwd = nindex.iter();
binarySearch(nindex, ibwd, refd);
ifwd.seek(ibwd.getOffset() + 1);
// This assumes a metric, as we exploit triangle inequality:
// Lower bound for candidates further from the reference object:
// d(candidate, reference) <= d(candidate, query) + d(query, reference)
// d(candidate, reference) - d(query, reference) <= d(candidate, query)
double lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
// Lower bound for candidates closer to the reference object:
// d(query, reference) <= d(query, candidate) + d(candidate, reference)
// d(query, reference) - d(candidate, reference) <= d(query, candidate)
double lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
while(true) {
// Handle NaN carefully.
if(!(lbfwd <= range) && !(lbbwd <= range)) {
break;
}
// Careful: NaN handling: not NaN and not worse than fwd (may be NaN).
if(lbfwd <= range && !(lbfwd > lbbwd)) {
final double dist = refine(ifwd, obj);
if(dist <= range) {
result.add(dist, ifwd);
}
// Advance iterator:
ifwd.advance();
lbfwd = ifwd.valid() ? Math.abs(ifwd.doubleValue() - refd) : Double.NaN;
}
if(lbbwd <= range && !(lbbwd > lbfwd)) {
final double dist = refine(ibwd, obj);
if(dist <= range) {
result.add(dist, ibwd);
}
// Retract iterator:
ibwd.retract();
lbbwd = ibwd.valid() ? Math.abs(ibwd.doubleValue() - refd) : Double.NaN;
}
}
}
}
}
/**
* Index factory for iDistance indexes.
*
* @author Erich Schubert
*
* @apiviz.has InMemoryIDistanceIndex
*
* @param Data type.
*/
public static class Factory implements IndexFactory> {
/**
* Distance function to use.
*/
DistanceFunction super V> distance;
/**
* Initialization method.
*/
KMedoidsInitialization initialization;
/**
* Number of reference points
*/
int k;
/**
* Constructor.
*
* @param distance Distance function
* @param initialization Initialization method
* @param k Number of reference points
*/
public Factory(DistanceFunction super V> distance, KMedoidsInitialization initialization, int k) {
super();
this.distance = distance;
this.initialization = initialization;
this.k = k;
}
@Override
public InMemoryIDistanceIndex instantiate(Relation relation) {
return new InMemoryIDistanceIndex<>(relation, distance.instantiate(relation), initialization, k);
}
@Override
public TypeInformation getInputTypeRestriction() {
return distance.getInputTypeRestriction();
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*
* @param object type.
*/
public static class Parameterizer extends AbstractParameterizer {
/**
* Parameter for the distance function
*/
public static final OptionID DISTANCE_ID = new OptionID("idistance.distance", "Distance function to build the index for.");
/**
* Initialization method.
*/
public static final OptionID REFERENCE_ID = new OptionID("idistance.reference", "Method to choose the reference points.");
/**
* Number of reference points.
*/
public static final OptionID K_ID = new OptionID("idistance.k", "Number of reference points to use.");
/**
* Distance function to use.
*/
DistanceFunction super V> distance;
/**
* Initialization method.
*/
KMedoidsInitialization initialization;
/**
* Number of reference points
*/
int k;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter> distanceP = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class);
if(config.grab(distanceP)) {
distance = distanceP.instantiateClass(config);
}
ObjectParameter> initializationP = new ObjectParameter<>(REFERENCE_ID, KMedoidsInitialization.class);
if(config.grab(initializationP)) {
initialization = initializationP.instantiateClass(config);
}
IntParameter kP = new IntParameter(K_ID)//
.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
k = kP.intValue();
}
}
@Override
protected InMemoryIDistanceIndex.Factory makeInstance() {
return new InMemoryIDistanceIndex.Factory<>(distance, initialization, k);
}
}
}
}