diff options
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/index/invertedlist/InMemoryInvertedIndex.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/index/invertedlist/InMemoryInvertedIndex.java | 467 |
1 files changed, 467 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/index/invertedlist/InMemoryInvertedIndex.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/index/invertedlist/InMemoryInvertedIndex.java new file mode 100644 index 00000000..7a3087ba --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/index/invertedlist/InMemoryInvertedIndex.java @@ -0,0 +1,467 @@ +package de.lmu.ifi.dbs.elki.index.invertedlist; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.SparseNumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.KNNHeap; +import de.lmu.ifi.dbs.elki.database.ids.KNNList; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.AbstractDistanceKNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.range.AbstractDistanceRangeQuery; +import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.ArcCosineDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.CosineDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.index.AbstractIndex; +import de.lmu.ifi.dbs.elki.index.IndexFactory; +import de.lmu.ifi.dbs.elki.index.KNNIndex; +import de.lmu.ifi.dbs.elki.index.RangeIndex; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Simple index using inverted lists. + * + * @author Erich Schubert + * + * @param <V> Vector type + */ +public class InMemoryInvertedIndex<V extends NumberVector> extends AbstractIndex<V> implements KNNIndex<V>, RangeIndex<V> { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(InMemoryInvertedIndex.class); + + /** + * Inverted index. + */ + ArrayList<ModifiableDoubleDBIDList> index; + + /** + * Length storage. + */ + WritableDoubleDataStore length; + + /** + * Constructor. + * + * @param relation Data. + */ + public InMemoryInvertedIndex(Relation<V> relation) { + super(relation); + } + + @Override + public void initialize() { + if(index != null) { + LOG.warning("Index was already initialized!"); + } + index = new ArrayList<>(); + length = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB); + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + V obj = relation.get(iter); + if(obj instanceof SparseNumberVector) { + indexSparse(iter, (SparseNumberVector) obj); + } + else { + indexDense(iter, obj); + } + } + // Sort indexes + long count = 0L; + for(ModifiableDoubleDBIDList column : index) { + column.sort(); + count += column.size(); + } + double sparsity = count / (index.size() * (double) relation.size()); + if(sparsity > .2) { + LOG.warning("Inverted list indexes only perform well for very sparse data. Your data set has a sparsity of " + sparsity); + } + } + + /** + * Index a single (sparse) instance. + * + * @param ref Object reference + * @param obj Object to index. + */ + private void indexSparse(DBIDRef ref, SparseNumberVector obj) { + double len = 0.; + for(int iter = obj.iter(); obj.iterValid(iter); iter = obj.iterAdvance(iter)) { + final int dim = obj.iterDim(iter); + final double val = obj.iterDoubleValue(iter); + if(val == 0. || val != val) { + continue; + } + len += val * val; + getOrCreateColumn(dim).add(val, ref); + } + length.put(ref, len); + } + + /** + * Index a single (dense) instance. + * + * @param ref Object reference + * @param obj Object to index. + */ + private void indexDense(DBIDRef ref, V obj) { + double len = 0.; + for(int dim = 0, max = obj.getDimensionality(); dim < max; dim++) { + final double val = obj.doubleValue(dim); + if(val == 0. || val != val) { + continue; + } + len += val * val; + getOrCreateColumn(dim).add(val, ref); + } + length.put(ref, Math.sqrt(len)); + } + + /** + * Get (or create) a column. + * + * @param dim Dimension + * @return Column + */ + private ModifiableDoubleDBIDList getOrCreateColumn(int dim) { + while(dim >= index.size()) { + index.add(DBIDUtil.newDistanceDBIDList()); + } + return index.get(dim); + } + + /** + * Query the most similar objects, sparse version. + * + * @param obj Query object + * @param scores Score storage + * @param cands Non-zero objects set + * @return Result + */ + private double naiveQuerySparse(SparseNumberVector obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) { + double len = 0.; // Length of query object, for final normalization + for(int iter = obj.iter(); obj.iterValid(iter); iter = obj.iterAdvance(iter)) { + final int dim = obj.iterDim(iter); + final double val = obj.iterDoubleValue(iter); + if(val == 0. || val != val) { + continue; + } + len += val * val; + // No matching documents in index: + if(dim >= index.size()) { + continue; + } + ModifiableDoubleDBIDList column = index.get(dim); + for(DoubleDBIDListIter n = column.iter(); n.valid(); n.advance()) { + scores.increment(n, n.doubleValue() * val); + cands.add(n); + } + } + return Math.sqrt(len); + } + + /** + * Query the most similar objects, dense version. + * + * @param obj Query object + * @param scores Score storage + * @param cands Non-zero objects set + * @return Result + */ + private double naiveQueryDense(NumberVector obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) { + double len = 0.; // Length of query object, for final normalization + for(int dim = 0, max = obj.getDimensionality(); dim < max; dim++) { + final double val = obj.doubleValue(dim); + if(val == 0. || val != val) { + continue; + } + len += val * val; + // No matching documents in index: + if(dim >= index.size()) { + continue; + } + ModifiableDoubleDBIDList column = index.get(dim); + for(DoubleDBIDListIter n = column.iter(); n.valid(); n.advance()) { + scores.increment(n, n.doubleValue() * val); + cands.add(n); + } + } + return Math.sqrt(len); + } + + /** + * Query the most similar objects, abstract version. + * + * @param obj Query object + * @param scores Score storage (must be initialized with zeros!) + * @param cands Non-zero objects set (must be empty) + * @return Result + */ + private double naiveQuery(V obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) { + if(obj instanceof SparseNumberVector) { + return naiveQuerySparse((SparseNumberVector) obj, scores, cands); + } + else { + return naiveQueryDense(obj, scores, cands); + } + } + + @Override + public void logStatistics() { + long count = 0L; + for(ModifiableDoubleDBIDList column : index) { + count += column.size(); + } + double sparsity = count / (index.size() * (double) relation.size()); + LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".sparsity", sparsity)); + } + + @Override + public KNNQuery<V> getKNNQuery(DistanceQuery<V> distanceQuery, Object... hints) { + DistanceFunction<? super V> df = distanceQuery.getDistanceFunction(); + if(df instanceof CosineDistanceFunction) { + return new CosineKNNQuery(distanceQuery); + } + if(df instanceof ArcCosineDistanceFunction) { + return new ArcCosineKNNQuery(distanceQuery); + } + return null; + } + + @Override + public RangeQuery<V> getRangeQuery(DistanceQuery<V> distanceQuery, Object... hints) { + DistanceFunction<? super V> df = distanceQuery.getDistanceFunction(); + if(df instanceof CosineDistanceFunction) { + return new CosineRangeQuery(distanceQuery); + } + if(df instanceof ArcCosineDistanceFunction) { + return new ArcCosineRangeQuery(distanceQuery); + } + return null; + } + + @Override + public String getLongName() { + return "Inverted lists index"; + } + + @Override + public String getShortName() { + return "inverted-lists"; + } + + /** + * kNN query object, for cosine distance. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected class CosineKNNQuery extends AbstractDistanceKNNQuery<V> { + /** + * Constructor. + * + * @param distanceQuery Distance query + */ + public CosineKNNQuery(DistanceQuery<V> distanceQuery) { + super(distanceQuery); + } + + @Override + public KNNList getKNNForObject(V obj, int k) { + HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // + DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); + double len = naiveQuery(obj, scores, cands); + // TODO: delay the division by len! + KNNHeap heap = DBIDUtil.newHeap(k); + for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { + double dist = 1. - scores.doubleValue(n) / (length.doubleValue(n) * len); + if(heap.getKNNDistance() >= dist) { + heap.insert(dist, n); + } + } + + return heap.toKNNList(); + } + } + + /** + * kNN query object, for arc cosine distance. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected class ArcCosineKNNQuery extends AbstractDistanceKNNQuery<V> { + /** + * Constructor. + * + * @param distanceQuery Distance query + */ + public ArcCosineKNNQuery(DistanceQuery<V> distanceQuery) { + super(distanceQuery); + } + + @Override + public KNNList getKNNForObject(V obj, int k) { + HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // + DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); + double len = naiveQuery(obj, scores, cands); + // TODO: delay the division by len and acos! + KNNHeap heap = DBIDUtil.newHeap(k); + for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { + double dist = Math.acos(scores.doubleValue(n) / (length.doubleValue(n) * len)); + if(heap.getKNNDistance() >= dist) { + heap.insert(dist, n); + } + } + + return heap.toKNNList(); + } + } + + /** + * kNN query object, for cosine distance. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected class CosineRangeQuery extends AbstractDistanceRangeQuery<V> { + /** + * Constructor. + * + * @param distanceQuery Distance query + */ + public CosineRangeQuery(DistanceQuery<V> distanceQuery) { + super(distanceQuery); + } + + @Override + public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) { + HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // + DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); + double len = naiveQuery(obj, scores, cands); + // dist = 1 - sim/len <-> sim = len * (1-dist) + double simrange = (1. - range) * len; + for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { + double sim = scores.doubleValue(n) / length.doubleValue(n); + if(sim >= simrange) { + result.add(1. - sim / len, n); + } + } + } + } + + /** + * kNN query object, for cosine distance. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected class ArcCosineRangeQuery extends AbstractDistanceRangeQuery<V> { + /** + * Constructor. + * + * @param distanceQuery Distance query + */ + public ArcCosineRangeQuery(DistanceQuery<V> distanceQuery) { + super(distanceQuery); + } + + @Override + public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) { + HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); + WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // + DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); + double len = naiveQuery(obj, scores, cands); + // dist = acos(sim/len) <-> sim = cos(dist)*len + double simrange = Math.cos(range) * len; + for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { + double sim = scores.doubleValue(n) / length.doubleValue(n); + if(sim >= simrange) { + result.add(Math.acos(sim / len), n); + } + } + } + } + + /** + * Index factory + * + * @author Erich Schubert + * + * @apiviz.has InMemoryInvertedIndex + * + * @param <V> Vector type + */ + public static class Factory<V extends NumberVector> implements IndexFactory<V, InMemoryInvertedIndex<V>> { + @Override + public InMemoryInvertedIndex<V> instantiate(Relation<V> relation) { + return new InMemoryInvertedIndex<>(relation); + } + + @Override + public TypeInformation getInputTypeRestriction() { + return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH; + } + + /** + * Parameterizer for inverted list index. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + @Override + protected Factory<V> makeInstance() { + return new Factory<>(); + } + } + } +} |