package de.lmu.ifi.dbs.elki.index.tree.metrical.covertree; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; import de.lmu.ifi.dbs.elki.database.ids.KNNHeap; import de.lmu.ifi.dbs.elki.database.ids.KNNList; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.AbstractDistanceKNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.range.AbstractDistanceRangeQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.index.KNNIndex; import de.lmu.ifi.dbs.elki.index.RangeIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleObjectMinHeap; /** * Simplified cover tree data structure (in-memory). This is a metrical * data structure that is similar to the M-tree, but not as balanced and * disk-oriented. However, by not having these requirements it does not require * the expensive splitting procedures of M-tree. * * This version does not store the distance to the parent, so it needs only * about 40% of the memory of {@link CoverTree} but does more distance * computations for search. * * Reference: *

* A. Beygelzimer, S. Kakade, J. Langford
* Cover trees for nearest neighbor
* In Proc. 23rd International Conference on Machine Learning (ICML). *

* * TODO: allow insertions and removals, as in the original publication. * * @author Erich Schubert * @since 0.7.0 * * @apiviz.has CoverTreeRangeQuery * @apiviz.has CoverTreeKNNQuery */ public class SimplifiedCoverTree extends AbstractCoverTreeimplements RangeIndex, KNNIndex { /** * Class logger. */ private static final Logging LOG = Logging.getLogger(SimplifiedCoverTree.class); /** * Tree root. */ private Node root = null; /** * Constructor. * * @param relation data relation * @param distanceFunction distance function * @param expansion Expansion rate * @param truncate Truncate branches with less than this number of instances. */ public SimplifiedCoverTree(Relation relation, DistanceFunction distanceFunction, double expansion, int truncate) { super(relation, distanceFunction, expansion, truncate); } /** * Node object. * * @author Erich Schubert * * @apiviz.exclude */ private static final class Node { /** * Objects in this node. Except for the first, which is the routing object. */ ArrayModifiableDBIDs singletons; /** * Maximum distance to descendants. */ double maxDist = 0.; /** * Child nodes. */ ArrayList children; /** * Constructor. * * @param r Object. * @param maxDist Maximum distance to any descendant. */ public Node(DBIDRef r, double maxDist) { this.singletons = DBIDUtil.newArray(); this.singletons.add(r); this.children = new ArrayList<>(); this.maxDist = maxDist; } /** * Constructor for leaf node. * * @param r Object. * @param maxDist Maximum distance to any descendant. * @param singletons Singletons. */ public Node(DBIDRef r, double maxDist, DoubleDBIDList singletons) { assert(!singletons.contains(r)); this.singletons = DBIDUtil.newArray(singletons.size() + 1); this.singletons.add(r); this.singletons.addDBIDs(singletons); this.children = null; this.maxDist = maxDist; } /** * True, if the node is a leaf. * * @return {@code true}, if this is a leaf node. */ public boolean isLeaf() { return children == null || children.size() == 0; } } @Override public void initialize() { bulkLoad(relation.getDBIDs()); if(LOG.isVerbose()) { int[] counts = new int[5]; checkCoverTree(root, counts, 0); LOG.statistics(new LongStatistic(this.getClass().getName() + ".nodes", counts[0])); LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".avg-depth", counts[1] / (double) counts[0])); LOG.statistics(new LongStatistic(this.getClass().getName() + ".max-depth", counts[2])); LOG.statistics(new LongStatistic(this.getClass().getName() + ".singletons", counts[3])); LOG.statistics(new LongStatistic(this.getClass().getName() + ".entries", counts[4])); } } /** * Bulk-load the index. * * @param ids IDs to load */ public void bulkLoad(DBIDs ids) { if(ids.size() == 0) { return; } assert(root == null) : "Tree already initialized."; DBIDIter it = ids.iter(); DBID first = DBIDUtil.deref(it); // Compute distances to all neighbors: ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(ids.size() - 1); for(it.advance(); it.valid(); it.advance()) { candidates.add(distance(first, it), it); } root = bulkConstruct(first, Integer.MAX_VALUE, candidates); } /** * Bulk-load the cover tree. * * This bulk-load is slightly simpler than the one used in the original * cover-tree source: We do not look back into the "far" set of candidates. * * @param cur Current routing object * @param maxScale Maximum scale * @param elems Candidates * @return Root node of subtree */ protected Node bulkConstruct(DBIDRef cur, int maxScale, ModifiableDoubleDBIDList elems) { assert(!elems.contains(cur)); final double max = maxDistance(elems); final int scale = Math.min(distToScale(max) - 1, maxScale); final int nextScale = scale - 1; // Leaf node, because points coincide, we are too deep, or have too few // elements remaining: if(max <= 0 || scale <= scaleBottom || elems.size() < truncate) { return new Node(cur, max, elems); } // Find neighbors in the cover of the current object: ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(); excludeNotCovered(elems, scaleToDist(scale), candidates); // If no elements were not in the cover, build a compact tree: if(candidates.size() == 0) { LOG.warning("Scale not chosen appropriately? " + max + " " + scaleToDist(scale)); return bulkConstruct(cur, nextScale, elems); } // We will have at least one other child, so build the parent: Node node = new Node(cur, max); // Routing element now is a singleton: final boolean curSingleton = elems.size() == 0; if(!curSingleton) { // Add node for the routing object: node.children.add(bulkConstruct(cur, nextScale, elems)); } final double fmax = scaleToDist(nextScale); // Build additional cover nodes: for(DoubleDBIDListIter it = candidates.iter(); it.valid();) { assert(it.getOffset() == 0); DBID t = DBIDUtil.deref(it); elems.clear(); // Recycle. collectByCover(it, candidates, fmax, elems); assert(DBIDUtil.equal(t, it)) : "First element in candidates must not change!"; if(elems.size() == 0) { // Singleton node.singletons.add(it); } else { // Build a full child node: node.children.add(bulkConstruct(it, nextScale, elems)); } candidates.removeSwap(0); } assert(candidates.size() == 0); // Routing object is not yet handled: if(curSingleton) { if(node.isLeaf()) { node.children = null; // First in leaf is enough. } else { node.singletons.add(cur); // Add as regular singleton. } } // TODO: improve recycling of lists? return node; } /** * Collect some statistics on the tree. * * @param cur Current node * @param counts Counter set * @param depth Current depth */ private void checkCoverTree(Node cur, int[] counts, int depth) { counts[0] += 1; // Node count counts[1] += depth; // Sum of depth counts[2] = depth > counts[2] ? depth : counts[2]; // Max depth counts[3] += cur.singletons.size() - 1; counts[4] += cur.singletons.size() - (cur.children == null ? 0 : 1); if(cur.children != null) { ++depth; for(Node chi : cur.children) { checkCoverTree(chi, counts, depth); } assert(cur.children.size() > 0) : "Empty childs list."; } } @Override public RangeQuery getRangeQuery(DistanceQuery distanceQuery, Object... hints) { // Query on the relation we index if(distanceQuery.getRelation() != relation) { return null; } DistanceFunction distanceFunction = (DistanceFunction) distanceQuery.getDistanceFunction(); if(!this.distanceFunction.equals(distanceFunction)) { LOG.debug("Distance function not supported by index - or 'equals' not implemented right!"); return null; } DistanceQuery dq = distanceFunction.instantiate(relation); return new CoverTreeRangeQuery(dq); } @Override public KNNQuery getKNNQuery(DistanceQuery distanceQuery, Object... hints) { // Query on the relation we index if(distanceQuery.getRelation() != relation) { return null; } DistanceFunction distanceFunction = (DistanceFunction) distanceQuery.getDistanceFunction(); if(!this.distanceFunction.equals(distanceFunction)) { return null; } DistanceQuery dq = distanceFunction.instantiate(relation); return new CoverTreeKNNQuery(dq); } @Override protected Logging getLogger() { return LOG; } /** * Range query class. * * @author Erich Schubert */ public class CoverTreeRangeQuery extends AbstractDistanceRangeQueryimplements RangeQuery { /** * Constructor. * * @param distanceQuery Distance query */ public CoverTreeRangeQuery(DistanceQuery distanceQuery) { super(distanceQuery); } @Override public void getRangeForObject(O obj, double range, ModifiableDoubleDBIDList ret) { ArrayList open = new ArrayList(); // LIFO stack open.add(root); DBIDVar r = DBIDUtil.newVar(); while(!open.isEmpty()) { final Node cur = open.remove(open.size() - 1); // pop() cur.singletons.assignVar(0, r); final double d = distance(obj, r); // Covered area not in range (metric assumption!): if(d - cur.maxDist > range) { continue; } if(!cur.isLeaf()) { // Inner node: for(int i = 0, l = cur.children.size(); i < l; i++) { open.add(cur.children.get(i)); } } else { // Leaf node // Consider routing object, too: if(d <= range) { ret.add(d, r); // First element is a candidate now } } // For remaining singletons, compute the distances: for(int i = 1, l = cur.singletons.size(); i < l; i++) { cur.singletons.assignVar(i, r); final double d2 = distance(obj, r); if(d2 <= range) { ret.add(d2, r); } } } } } /** * KNN Query class. * * @author Erich Schubert */ public class CoverTreeKNNQuery extends AbstractDistanceKNNQueryimplements KNNQuery { /** * Constructor. * * @param distanceQuery Distance */ public CoverTreeKNNQuery(DistanceQuery distanceQuery) { super(distanceQuery); } @Override public KNNList getKNNForObject(O obj, int k) { if(k < 1) { throw new IllegalArgumentException("At least one object has to be requested!"); } KNNHeap knnList = DBIDUtil.newHeap(k); double d_k = Double.POSITIVE_INFINITY; final DoubleObjectMinHeap pq = new DoubleObjectMinHeap<>(); // Push the root node final double rootdist = distance(obj, root.singletons.iter()); pq.add(rootdist - root.maxDist, root); // search in tree while(!pq.isEmpty()) { final Node cur = pq.peekValue(); final double prio = pq.peekKey(); // Minimum distance to cover final double d = prio + cur.maxDist; // Restore distance to center. pq.poll(); // Remove if(knnList.size() >= k && prio > d_k) { continue; } final DBIDIter it = cur.singletons.iter(); if(!cur.isLeaf()) { // Inner node: for(Node c : cur.children) { final DBIDIter f = c.singletons.iter(); final double dist = DBIDUtil.equal(f, it) ? d : distance(obj, f); final double newprio = dist - c.maxDist; // Minimum distance if(newprio <= d_k) { pq.add(newprio, c); } } } else { // Leaf node // Consider routing object, too: if(d <= d_k) { d_k = knnList.insert(d, it); // First element is a candidate now } } it.advance(); // Skip routing object. // For remaining singletons, compute the distances: while(it.valid()) { final double d2 = distance(obj, it); if(d2 <= d_k) { d_k = knnList.insert(d2, it); } it.advance(); } } return knnList.toKNNList(); } } /** * Index factory. * * @author Erich Schubert * * @apiviz.has SimplifiedCoverTree * * @param Object type */ public static class Factory extends AbstractCoverTree.Factory> { /** * Constructor. * * @param distanceFunction Distance function * @param expansion Expansion rate * @param truncate Truncate branches with less than this number of * instances. */ public Factory(DistanceFunction distanceFunction, double expansion, int truncate) { super(distanceFunction, expansion, truncate); } @Override public SimplifiedCoverTree instantiate(Relation relation) { return new SimplifiedCoverTree(relation, distanceFunction, expansion, truncate); } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractCoverTree.Factory.Parameterizer { @Override protected SimplifiedCoverTree.Factory makeInstance() { return new SimplifiedCoverTree.Factory<>(distanceFunction, expansion, truncate); } } } }