package de.lmu.ifi.dbs.elki.index.invertedlist;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2015
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.ArrayList;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.KNNHeap;
import de.lmu.ifi.dbs.elki.database.ids.KNNList;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.AbstractDistanceKNNQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
import de.lmu.ifi.dbs.elki.database.query.range.AbstractDistanceRangeQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.ArcCosineDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.CosineDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.index.AbstractIndex;
import de.lmu.ifi.dbs.elki.index.IndexFactory;
import de.lmu.ifi.dbs.elki.index.KNNIndex;
import de.lmu.ifi.dbs.elki.index.RangeIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
/**
* Simple index using inverted lists.
*
* @author Erich Schubert
*
* @param Vector type
*/
public class InMemoryInvertedIndex extends AbstractIndex implements KNNIndex, RangeIndex {
/**
* Class logger.
*/
private static final Logging LOG = Logging.getLogger(InMemoryInvertedIndex.class);
/**
* Inverted index.
*/
ArrayList index;
/**
* Length storage.
*/
WritableDoubleDataStore length;
/**
* Constructor.
*
* @param relation Data.
*/
public InMemoryInvertedIndex(Relation relation) {
super(relation);
}
@Override
public void initialize() {
if(index != null) {
LOG.warning("Index was already initialized!");
}
index = new ArrayList<>();
length = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB);
for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
V obj = relation.get(iter);
if(obj instanceof SparseNumberVector) {
indexSparse(iter, (SparseNumberVector) obj);
}
else {
indexDense(iter, obj);
}
}
// Sort indexes
long count = 0L;
for(ModifiableDoubleDBIDList column : index) {
column.sort();
count += column.size();
}
double sparsity = count / (index.size() * (double) relation.size());
if(sparsity > .2) {
LOG.warning("Inverted list indexes only perform well for very sparse data. Your data set has a sparsity of " + sparsity);
}
}
/**
* Index a single (sparse) instance.
*
* @param ref Object reference
* @param obj Object to index.
*/
private void indexSparse(DBIDRef ref, SparseNumberVector obj) {
double len = 0.;
for(int iter = obj.iter(); obj.iterValid(iter); iter = obj.iterAdvance(iter)) {
final int dim = obj.iterDim(iter);
final double val = obj.iterDoubleValue(iter);
if(val == 0. || val != val) {
continue;
}
len += val * val;
getOrCreateColumn(dim).add(val, ref);
}
length.put(ref, len);
}
/**
* Index a single (dense) instance.
*
* @param ref Object reference
* @param obj Object to index.
*/
private void indexDense(DBIDRef ref, V obj) {
double len = 0.;
for(int dim = 0, max = obj.getDimensionality(); dim < max; dim++) {
final double val = obj.doubleValue(dim);
if(val == 0. || val != val) {
continue;
}
len += val * val;
getOrCreateColumn(dim).add(val, ref);
}
length.put(ref, Math.sqrt(len));
}
/**
* Get (or create) a column.
*
* @param dim Dimension
* @return Column
*/
private ModifiableDoubleDBIDList getOrCreateColumn(int dim) {
while(dim >= index.size()) {
index.add(DBIDUtil.newDistanceDBIDList());
}
return index.get(dim);
}
/**
* Query the most similar objects, sparse version.
*
* @param obj Query object
* @param scores Score storage
* @param cands Non-zero objects set
* @return Result
*/
private double naiveQuerySparse(SparseNumberVector obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) {
double len = 0.; // Length of query object, for final normalization
for(int iter = obj.iter(); obj.iterValid(iter); iter = obj.iterAdvance(iter)) {
final int dim = obj.iterDim(iter);
final double val = obj.iterDoubleValue(iter);
if(val == 0. || val != val) {
continue;
}
len += val * val;
// No matching documents in index:
if(dim >= index.size()) {
continue;
}
ModifiableDoubleDBIDList column = index.get(dim);
for(DoubleDBIDListIter n = column.iter(); n.valid(); n.advance()) {
scores.increment(n, n.doubleValue() * val);
cands.add(n);
}
}
return Math.sqrt(len);
}
/**
* Query the most similar objects, dense version.
*
* @param obj Query object
* @param scores Score storage
* @param cands Non-zero objects set
* @return Result
*/
private double naiveQueryDense(NumberVector obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) {
double len = 0.; // Length of query object, for final normalization
for(int dim = 0, max = obj.getDimensionality(); dim < max; dim++) {
final double val = obj.doubleValue(dim);
if(val == 0. || val != val) {
continue;
}
len += val * val;
// No matching documents in index:
if(dim >= index.size()) {
continue;
}
ModifiableDoubleDBIDList column = index.get(dim);
for(DoubleDBIDListIter n = column.iter(); n.valid(); n.advance()) {
scores.increment(n, n.doubleValue() * val);
cands.add(n);
}
}
return Math.sqrt(len);
}
/**
* Query the most similar objects, abstract version.
*
* @param obj Query object
* @param scores Score storage (must be initialized with zeros!)
* @param cands Non-zero objects set (must be empty)
* @return Result
*/
private double naiveQuery(V obj, WritableDoubleDataStore scores, HashSetModifiableDBIDs cands) {
if(obj instanceof SparseNumberVector) {
return naiveQuerySparse((SparseNumberVector) obj, scores, cands);
}
else {
return naiveQueryDense(obj, scores, cands);
}
}
@Override
public void logStatistics() {
long count = 0L;
for(ModifiableDoubleDBIDList column : index) {
count += column.size();
}
double sparsity = count / (index.size() * (double) relation.size());
LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".sparsity", sparsity));
}
@Override
public KNNQuery getKNNQuery(DistanceQuery distanceQuery, Object... hints) {
DistanceFunction super V> df = distanceQuery.getDistanceFunction();
if(df instanceof CosineDistanceFunction) {
return new CosineKNNQuery(distanceQuery);
}
if(df instanceof ArcCosineDistanceFunction) {
return new ArcCosineKNNQuery(distanceQuery);
}
return null;
}
@Override
public RangeQuery getRangeQuery(DistanceQuery distanceQuery, Object... hints) {
DistanceFunction super V> df = distanceQuery.getDistanceFunction();
if(df instanceof CosineDistanceFunction) {
return new CosineRangeQuery(distanceQuery);
}
if(df instanceof ArcCosineDistanceFunction) {
return new ArcCosineRangeQuery(distanceQuery);
}
return null;
}
@Override
public String getLongName() {
return "Inverted lists index";
}
@Override
public String getShortName() {
return "inverted-lists";
}
/**
* kNN query object, for cosine distance.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected class CosineKNNQuery extends AbstractDistanceKNNQuery {
/**
* Constructor.
*
* @param distanceQuery Distance query
*/
public CosineKNNQuery(DistanceQuery distanceQuery) {
super(distanceQuery);
}
@Override
public KNNList getKNNForObject(V obj, int k) {
HashSetModifiableDBIDs cands = DBIDUtil.newHashSet();
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, //
DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
double len = naiveQuery(obj, scores, cands);
// TODO: delay the division by len!
KNNHeap heap = DBIDUtil.newHeap(k);
for(DBIDIter n = cands.iter(); n.valid(); n.advance()) {
double dist = 1. - scores.doubleValue(n) / (length.doubleValue(n) * len);
if(heap.getKNNDistance() >= dist) {
heap.insert(dist, n);
}
}
return heap.toKNNList();
}
}
/**
* kNN query object, for arc cosine distance.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected class ArcCosineKNNQuery extends AbstractDistanceKNNQuery {
/**
* Constructor.
*
* @param distanceQuery Distance query
*/
public ArcCosineKNNQuery(DistanceQuery distanceQuery) {
super(distanceQuery);
}
@Override
public KNNList getKNNForObject(V obj, int k) {
HashSetModifiableDBIDs cands = DBIDUtil.newHashSet();
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, //
DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
double len = naiveQuery(obj, scores, cands);
// TODO: delay the division by len and acos!
KNNHeap heap = DBIDUtil.newHeap(k);
for(DBIDIter n = cands.iter(); n.valid(); n.advance()) {
double dist = Math.acos(scores.doubleValue(n) / (length.doubleValue(n) * len));
if(heap.getKNNDistance() >= dist) {
heap.insert(dist, n);
}
}
return heap.toKNNList();
}
}
/**
* kNN query object, for cosine distance.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected class CosineRangeQuery extends AbstractDistanceRangeQuery {
/**
* Constructor.
*
* @param distanceQuery Distance query
*/
public CosineRangeQuery(DistanceQuery distanceQuery) {
super(distanceQuery);
}
@Override
public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) {
HashSetModifiableDBIDs cands = DBIDUtil.newHashSet();
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, //
DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
double len = naiveQuery(obj, scores, cands);
// dist = 1 - sim/len <-> sim = len * (1-dist)
double simrange = (1. - range) * len;
for(DBIDIter n = cands.iter(); n.valid(); n.advance()) {
double sim = scores.doubleValue(n) / length.doubleValue(n);
if(sim >= simrange) {
result.add(1. - sim / len, n);
}
}
}
}
/**
* kNN query object, for cosine distance.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
protected class ArcCosineRangeQuery extends AbstractDistanceRangeQuery {
/**
* Constructor.
*
* @param distanceQuery Distance query
*/
public ArcCosineRangeQuery(DistanceQuery distanceQuery) {
super(distanceQuery);
}
@Override
public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) {
HashSetModifiableDBIDs cands = DBIDUtil.newHashSet();
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(cands, //
DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
double len = naiveQuery(obj, scores, cands);
// dist = acos(sim/len) <-> sim = cos(dist)*len
double simrange = Math.cos(range) * len;
for(DBIDIter n = cands.iter(); n.valid(); n.advance()) {
double sim = scores.doubleValue(n) / length.doubleValue(n);
if(sim >= simrange) {
result.add(Math.acos(sim / len), n);
}
}
}
}
/**
* Index factory
*
* @author Erich Schubert
*
* @apiviz.has InMemoryInvertedIndex
*
* @param Vector type
*/
public static class Factory implements IndexFactory> {
@Override
public InMemoryInvertedIndex instantiate(Relation relation) {
return new InMemoryInvertedIndex<>(relation);
}
@Override
public TypeInformation getInputTypeRestriction() {
return TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH;
}
/**
* Parameterizer for inverted list index.
*
* @author Erich Schubert
*
* @apiviz.exclude
*
* @param Vector type
*/
public static class Parameterizer extends AbstractParameterizer {
@Override
protected Factory makeInstance() {
return new Factory<>();
}
}
}
}