package de.lmu.ifi.dbs.elki.index.invertedlist; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.SparseNumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.KNNHeap; import de.lmu.ifi.dbs.elki.database.ids.KNNList; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.knn.AbstractDistanceKNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.range.AbstractDistanceRangeQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.ArcCosineDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.CosineDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.index.AbstractIndex; import de.lmu.ifi.dbs.elki.index.IndexFactory; import de.lmu.ifi.dbs.elki.index.KNNIndex; import de.lmu.ifi.dbs.elki.index.RangeIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** * Simple index using inverted lists. * * @author Erich Schubert * * @param Vector type */ public class InMemoryInvertedIndex extends AbstractIndex implements KNNIndex, RangeIndex { /** * Class logger. */ private static final Logging LOG = Logging.getLogger(InMemoryInvertedIndex.class); /** * Inverted index. */ ArrayList index; /** * Length storage. */ WritableDoubleDataStore length; /** * Constructor. * * @param relation Data. */ public InMemoryInvertedIndex(Relation relation) { super(relation); } @Override public void initialize() { if(index != null) { LOG.warning("Index was already initialized!"); } index = new ArrayList<>(); length = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB); for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { V obj = relation.get(iter); if(obj instanceof SparseNumberVector) { indexSparse(iter, (SparseNumberVector) obj); } else { indexDense(iter, obj); } } // Sort indexes long count = 0L; for(ModifiableDoubleDBIDList column : index) { column.sort(); count += column.size(); } double sparsity = count / (index.size() * (double) relation.size()); if(sparsity > .2) { LOG.warning("Inverted list indexes only perform well for very sparse data. Your data set has a sparsity of " + sparsity); } } /** * Index a single (sparse) instance. * * @param ref Object reference * @param obj Object to index. */ private void indexSparse(DBIDRef ref, SparseNumberVector obj) { double len = 0.; for(int iter = obj.iter(); obj.iterValid(iter); iter = obj.iterAdvance(iter)) { final int dim = obj.iterDim(iter); final double val = obj.iterDoubleValue(iter); if(val == 0. || val != val) { continue; } len += val * val; getOrCreateColumn(dim).add(val, ref); } length.put(ref, len); } /** * Index a single (dense) instance. * * @param ref Object reference * @param obj Object to index. */ private void indexDense(DBIDRef ref, V obj) { double len = 0.; for(int dim = 0, max = obj.getDimensionality(); dim < max; dim++) { final double val = obj.doubleValue(dim); if(val == 0. || val != val) { continue; } len += val * val; getOrCreateColumn(dim).add(val, ref); } length.put(ref, Math.sqrt(len)); } /** * Get (or create) a column. * * @param dim Dimension * @return Column */ private ModifiableDoubleDBIDList getOrCreateColumn(int dim) { while(dim >= index.size()) { index.add(DBIDUtil.newDistanceDBIDList()); } return index.get(dim); } /** * Query the most similar objects, sparse version. * * @param obj Query object * @param scores Score storage * @param cands Non-zero objects set * @return Result */ private double naiveQuerySparse(SparseNumberVector obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) { double len = 0.; // Length of query object, for final normalization for(int iter = obj.iter(); obj.iterValid(iter); iter = obj.iterAdvance(iter)) { final int dim = obj.iterDim(iter); final double val = obj.iterDoubleValue(iter); if(val == 0. || val != val) { continue; } len += val * val; // No matching documents in index: if(dim >= index.size()) { continue; } ModifiableDoubleDBIDList column = index.get(dim); for(DoubleDBIDListIter n = column.iter(); n.valid(); n.advance()) { scores.increment(n, n.doubleValue() * val); cands.add(n); } } return Math.sqrt(len); } /** * Query the most similar objects, dense version. * * @param obj Query object * @param scores Score storage * @param cands Non-zero objects set * @return Result */ private double naiveQueryDense(NumberVector obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) { double len = 0.; // Length of query object, for final normalization for(int dim = 0, max = obj.getDimensionality(); dim < max; dim++) { final double val = obj.doubleValue(dim); if(val == 0. || val != val) { continue; } len += val * val; // No matching documents in index: if(dim >= index.size()) { continue; } ModifiableDoubleDBIDList column = index.get(dim); for(DoubleDBIDListIter n = column.iter(); n.valid(); n.advance()) { scores.increment(n, n.doubleValue() * val); cands.add(n); } } return Math.sqrt(len); } /** * Query the most similar objects, abstract version. * * @param obj Query object * @param scores Score storage (must be initialized with zeros!) * @param cands Non-zero objects set (must be empty) * @return Result */ private double naiveQuery(V obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) { if(obj instanceof SparseNumberVector) { return naiveQuerySparse((SparseNumberVector) obj, scores, cands); } else { return naiveQueryDense(obj, scores, cands); } } @Override public void logStatistics() { long count = 0L; for(ModifiableDoubleDBIDList column : index) { count += column.size(); } double sparsity = count / (index.size() * (double) relation.size()); LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".sparsity", sparsity)); } @Override public KNNQuery getKNNQuery(DistanceQuery distanceQuery, Object... hints) { DistanceFunction df = distanceQuery.getDistanceFunction(); if(df instanceof CosineDistanceFunction) { return new CosineKNNQuery(distanceQuery); } if(df instanceof ArcCosineDistanceFunction) { return new ArcCosineKNNQuery(distanceQuery); } return null; } @Override public RangeQuery getRangeQuery(DistanceQuery distanceQuery, Object... hints) { DistanceFunction df = distanceQuery.getDistanceFunction(); if(df instanceof CosineDistanceFunction) { return new CosineRangeQuery(distanceQuery); } if(df instanceof ArcCosineDistanceFunction) { return new ArcCosineRangeQuery(distanceQuery); } return null; } @Override public String getLongName() { return "Inverted lists index"; } @Override public String getShortName() { return "inverted-lists"; } /** * kNN query object, for cosine distance. * * @author Erich Schubert * * @apiviz.exclude */ protected class CosineKNNQuery extends AbstractDistanceKNNQuery { /** * Constructor. * * @param distanceQuery Distance query */ public CosineKNNQuery(DistanceQuery distanceQuery) { super(distanceQuery); } @Override public KNNList getKNNForObject(V obj, int k) { HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); double len = naiveQuery(obj, scores, cands); // TODO: delay the division by len! KNNHeap heap = DBIDUtil.newHeap(k); for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { double dist = 1. - scores.doubleValue(n) / (length.doubleValue(n) * len); if(heap.getKNNDistance() >= dist) { heap.insert(dist, n); } } return heap.toKNNList(); } } /** * kNN query object, for arc cosine distance. * * @author Erich Schubert * * @apiviz.exclude */ protected class ArcCosineKNNQuery extends AbstractDistanceKNNQuery { /** * Constructor. * * @param distanceQuery Distance query */ public ArcCosineKNNQuery(DistanceQuery distanceQuery) { super(distanceQuery); } @Override public KNNList getKNNForObject(V obj, int k) { HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); double len = naiveQuery(obj, scores, cands); // TODO: delay the division by len and acos! KNNHeap heap = DBIDUtil.newHeap(k); for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { double dist = Math.acos(scores.doubleValue(n) / (length.doubleValue(n) * len)); if(heap.getKNNDistance() >= dist) { heap.insert(dist, n); } } return heap.toKNNList(); } } /** * kNN query object, for cosine distance. * * @author Erich Schubert * * @apiviz.exclude */ protected class CosineRangeQuery extends AbstractDistanceRangeQuery { /** * Constructor. * * @param distanceQuery Distance query */ public CosineRangeQuery(DistanceQuery distanceQuery) { super(distanceQuery); } @Override public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) { HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); double len = naiveQuery(obj, scores, cands); // dist = 1 - sim/len <-> sim = len * (1-dist) double simrange = (1. - range) * len; for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { double sim = scores.doubleValue(n) / length.doubleValue(n); if(sim >= simrange) { result.add(1. - sim / len, n); } } } } /** * kNN query object, for cosine distance. * * @author Erich Schubert * * @apiviz.exclude */ protected class ArcCosineRangeQuery extends AbstractDistanceRangeQuery { /** * Constructor. * * @param distanceQuery Distance query */ public ArcCosineRangeQuery(DistanceQuery distanceQuery) { super(distanceQuery); } @Override public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) { HashSetModifiableDBIDs cands = DBIDUtil.newHashSet(); WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, // DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.); double len = naiveQuery(obj, scores, cands); // dist = acos(sim/len) <-> sim = cos(dist)*len double simrange = Math.cos(range) * len; for(DBIDIter n = cands.iter(); n.valid(); n.advance()) { double sim = scores.doubleValue(n) / length.doubleValue(n); if(sim >= simrange) { result.add(Math.acos(sim / len), n); } } } } /** * Index factory * * @author Erich Schubert * * @apiviz.has InMemoryInvertedIndex * * @param Vector type */ public static class Factory implements IndexFactory> { @Override public InMemoryInvertedIndex instantiate(Relation relation) { return new InMemoryInvertedIndex<>(relation); } @Override public TypeInformation getInputTypeRestriction() { return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH; } /** * Parameterizer for inverted list index. * * @author Erich Schubert * * @apiviz.exclude * * @param Vector type */ public static class Parameterizer extends AbstractParameterizer { @Override protected Factory makeInstance() { return new Factory<>(); } } } }