summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java331
1 files changed, 222 insertions, 109 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java
index e76c6034..8d371d4c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,9 +23,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.Arrays;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
@@ -37,15 +35,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
-import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
@@ -54,23 +52,24 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.DoubleIntegerArrayQuickSort;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
-import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
/**
* Fast Outlier Detection Using the "Local Correlation Integral".
*
- * Exact implementation only, not aLOCI. See {@link ALOCI}
+ * Exact implementation only, not aLOCI. See {@link ALOCI}.
*
* Outlier detection using multiple epsilon neighborhoods.
*
+ * This implementation has O(n<sup>3</sup> log n) runtime complexity!
+ *
* Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos:
* LOCI: Fast Outlier Detection Using the Local Correlation Integral. In: Proc.
* 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003.
@@ -80,13 +79,12 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
* @apiviz.has RangeQuery
*
* @param <O> Object type
- * @param <D> Distance type
*/
@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral")
@Description("Algorithm to compute outliers based on the Local Correlation Integral")
@Reference(authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802")
-@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI"})
-public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
+@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI" })
+public class LOCI<O> extends AbstractDistanceBasedAlgorithm<O, OutlierResult> implements OutlierAlgorithm {
/**
* The logger for this class.
*/
@@ -111,7 +109,7 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
/**
* Holds the value of {@link #RMAX_ID}.
*/
- private D rmax;
+ private double rmax;
/**
* Holds the value of {@link #NMIN_ID}.
@@ -131,7 +129,7 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
* @param nmin Minimum neighborhood size
* @param alpha Alpha value
*/
- public LOCI(DistanceFunction<? super O, D> distanceFunction, D rmax, int nmin, double alpha) {
+ public LOCI(DistanceFunction<? super O> distanceFunction, double rmax, int nmin, double alpha) {
super(distanceFunction);
this.rmax = rmax;
this.nmin = nmin;
@@ -146,96 +144,62 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
* @return Outlier result
*/
public OutlierResult run(Database database, Relation<O> relation) {
- DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
- RangeQuery<O, D> rangeQuery = database.getRangeQuery(distFunc);
+ DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
+ RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc);
+ DBIDs ids = relation.getDBIDs();
- FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", relation.size(), LOG) : null;
// LOCI preprocessing step
- WritableDataStore<ArrayList<DoubleIntPair>> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, ArrayList.class);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
- // build list of critical distances
- ArrayList<DoubleIntPair> cdist = new ArrayList<>(neighbors.size() << 1);
- {
- for(int i = 0; i < neighbors.size(); i++) {
- DistanceDBIDPair<D> r = neighbors.get(i);
- if(i + 1 < neighbors.size() && r.getDistance().compareTo(neighbors.get(i + 1).getDistance()) == 0) {
- continue;
- }
- cdist.add(new DoubleIntPair(r.getDistance().doubleValue(), i));
- final double ri = r.getDistance().doubleValue() / alpha;
- if(ri <= rmax.doubleValue()) {
- cdist.add(new DoubleIntPair(ri, Integer.MIN_VALUE));
- }
- }
- }
- Collections.sort(cdist);
- // fill the gaps to have fast lookups of number of neighbors at a given
- // distance.
- int lastk = 0;
- for(DoubleIntPair c : cdist) {
- if(c.second == Integer.MIN_VALUE) {
- c.second = lastk;
- }
- else {
- lastk = c.second;
- }
- }
-
- interestingDistances.put(iditer, cdist);
- if(progressPreproc != null) {
- progressPreproc.incrementProcessed(LOG);
- }
- }
- if(progressPreproc != null) {
- progressPreproc.ensureCompleted(LOG);
- }
+ WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class);
+ precomputeInterestingRadii(ids, rangeQuery, interestingDistances);
// LOCI main step
FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null;
WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
DoubleMinMax minmax = new DoubleMinMax();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- final List<DoubleIntPair> cdist = interestingDistances.get(iditer);
- final double maxdist = cdist.get(cdist.size() - 1).first;
- final int maxneig = cdist.get(cdist.size() - 1).second;
+ // Shared instance, to save allocations.
+ MeanVariance mv_n_r_alpha = new MeanVariance();
+
+ for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ final DoubleIntArrayList cdist = interestingDistances.get(iditer);
+ final double maxdist = cdist.getDouble(cdist.size() - 1);
+ final int maxneig = cdist.getInt(cdist.size() - 1);
double maxmdefnorm = 0.0;
double maxnormr = 0;
if(maxneig >= nmin) {
- D range = distFunc.getDistanceFactory().fromDouble(maxdist);
// Compute the largest neighborhood we will need.
- DistanceDBIDList<D> maxneighbors = rangeQuery.getRangeForDBID(iditer, range);
- // TODO: Ensure the set is sorted. Should be a no-op with most indexes.
+ DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist);
+ // TODO: Ensure the result is sorted. This is currently implied.
+
// For any critical distance, compute the normalized MDEF score.
- for(DoubleIntPair c : cdist) {
+ for(int i = 0, size = cdist.size(); i < size; i++) {
// Only start when minimum size is fulfilled
- if (c.second < nmin) {
+ if(cdist.getInt(i) < nmin) {
continue;
}
- final double r = c.first;
+ final double r = cdist.getDouble(i);
final double alpha_r = alpha * r;
- // compute n(p_i, \alpha * r) from list (note: alpha_r is different from c!)
- final int n_alphar = elementsAtRadius(cdist, alpha_r);
+ // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!)
+ final int n_alphar = cdist.getInt(cdist.find(alpha_r));
// compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF}
- MeanVariance mv_n_r_alpha = new MeanVariance();
- // TODO: optimize for double distances
- for (DistanceDBIDListIter<D> neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ mv_n_r_alpha.reset();
+ for(DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) {
// Stop at radius r
- if(neighbor.getDistance().doubleValue() > r) {
+ if(neighbor.doubleValue() > r) {
break;
}
- int rn_alphar = elementsAtRadius(interestingDistances.get(neighbor), alpha_r);
+ DoubleIntArrayList cdist2 = interestingDistances.get(neighbor);
+ int rn_alphar = cdist2.getInt(cdist2.find(alpha_r));
mv_n_r_alpha.put(rn_alphar);
}
// We only use the average and standard deviation
final double nhat_r_alpha = mv_n_r_alpha.getMean();
final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev();
- // Redundant divisions removed.
- final double mdef = (nhat_r_alpha - n_alphar); // / nhat_r_alpha;
- final double sigmamdef = sigma_nhat_r_alpha; // / nhat_r_alpha;
+ // Redundant divisions by nhat_r_alpha removed.
+ final double mdef = nhat_r_alpha - n_alphar;
+ final double sigmamdef = sigma_nhat_r_alpha;
final double mdefnorm = mdef / sigmamdef;
if(mdefnorm > maxmdefnorm) {
@@ -246,46 +210,194 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
}
else {
// FIXME: when nmin was not fulfilled - what is the proper value then?
- maxmdefnorm = 1.0;
+ maxmdefnorm = Double.POSITIVE_INFINITY;
maxnormr = maxdist;
}
mdef_norm.putDouble(iditer, maxmdefnorm);
mdef_radius.putDouble(iditer, maxnormr);
minmax.put(maxmdefnorm);
- if(progressLOCI != null) {
- progressLOCI.incrementProcessed(LOG);
- }
- }
- if(progressLOCI != null) {
- progressLOCI.ensureCompleted(LOG);
+ LOG.incrementProcessed(progressLOCI);
}
- Relation<Double> scoreResult = new MaterializedRelation<>("LOCI normalized MDEF", "loci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs());
+ LOG.ensureCompleted(progressLOCI);
+ DoubleRelation scoreResult = new MaterializedDoubleRelation("LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0);
OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
- result.addChildResult(new MaterializedRelation<>("LOCI MDEF Radius", "loci-critical-radius", TypeUtil.DOUBLE, mdef_radius, relation.getDBIDs()));
+ result.addChildResult(new MaterializedDoubleRelation("LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs()));
return result;
}
/**
- * Get the number of objects for a given radius, from the list of critical
- * distances, storing (radius, count) pairs.
+ * Preprocessing step: determine the radii of interest for each point.
*
- * @param criticalDistances
- * @param radius
- * @return Number of elements at the given radius
+ * @param ids IDs to process
+ * @param rangeQuery Range query
+ * @param interestingDistances Distances of interest
*/
- protected int elementsAtRadius(List<DoubleIntPair> criticalDistances, final double radius) {
- int n_r = 0;
- for(DoubleIntPair c2 : criticalDistances) {
- if(c2.first > radius) {
- break;
+ protected void precomputeInterestingRadii(DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) {
+ FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null;
+ for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax);
+ // build list of critical distances
+ DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1);
+ {
+ int i = 0;
+ DoubleDBIDListIter ni = neighbors.iter();
+ while(ni.valid()) {
+ final double curdist = ni.doubleValue();
+ ++i;
+ ni.advance();
+ // Skip, if tied to the next object:
+ if(ni.valid() && curdist == ni.doubleValue()) {
+ continue;
+ }
+ cdist.append(curdist, i);
+ // Scale radius, and reinsert
+ if(alpha != 1.) {
+ final double ri = curdist / alpha;
+ if(ri <= rmax) {
+ cdist.append(ri, Integer.MIN_VALUE);
+ }
+ }
+ }
}
- if(c2.second != Integer.MIN_VALUE) {
- // Update
- n_r = c2.second;
+ cdist.sort();
+
+ // fill the gaps to have fast lookups of number of neighbors at a given
+ // distance.
+ int lastk = 0;
+ for(int i = 0, size = cdist.size(); i < size; i++) {
+ final int k = cdist.getInt(i);
+ if(k == Integer.MIN_VALUE) {
+ cdist.setValue(i, lastk);
+ }
+ else {
+ lastk = k;
+ }
}
+ // TODO: shrink the list, removing duplicate radii?
+
+ interestingDistances.put(iditer, cdist);
+ LOG.incrementProcessed(progressPreproc);
+ }
+ LOG.ensureCompleted(progressPreproc);
+ }
+
+ /**
+ * Array of double-int values.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ protected static class DoubleIntArrayList {
+ /**
+ * Double keys
+ */
+ double[] keys;
+
+ /**
+ * Integer values
+ */
+ int[] vals;
+
+ /**
+ * Used size
+ */
+ int size = 0;
+
+ /**
+ * Constructor.
+ *
+ * @param alloc Initial allocation.
+ */
+ public DoubleIntArrayList(int alloc) {
+ keys = new double[alloc];
+ vals = new int[alloc];
+ size = 0;
+ }
+
+ /**
+ * Collection size.
+ *
+ * @return Size
+ */
+ public int size() {
+ return size;
+ }
+
+ /**
+ * Get the key at the given position.
+ *
+ * @param i Position
+ * @return Key
+ */
+ public double getDouble(int i) {
+ return keys[i];
+ }
+
+ /**
+ * Get the value at the given position.
+ *
+ * @param i Position
+ * @return Value
+ */
+ public int getInt(int i) {
+ return vals[i];
+ }
+
+ /**
+ * Get the value at the given position.
+ *
+ * @param i Position
+ * @param val New value
+ */
+ public void setValue(int i, int val) {
+ vals[i] = val;
+ }
+
+ /**
+ * Append a key-value pair.
+ *
+ * @param key Key to append
+ * @param val Value to append.
+ */
+ public void append(double key, int val) {
+ if(size == keys.length) {
+ keys = Arrays.copyOf(keys, size << 1);
+ vals = Arrays.copyOf(vals, size << 1);
+ }
+ keys[size] = key;
+ vals[size] = val;
+ ++size;
+ }
+
+ /**
+ * Find the last position with a smaller or equal key.
+ *
+ * @param search Key
+ * @return Position
+ */
+ public int find(final double search) {
+ int a = 0, b = size - 1;
+ while(a <= b) {
+ final int mid = (a + b) >>> 1;
+ final double cur = keys[mid];
+ if(cur > search) {
+ b = mid - 1;
+ }
+ else { // less or equal!
+ a = mid + 1;
+ }
+ }
+ return b;
+ }
+
+ /**
+ * Sort the array list.
+ */
+ public void sort() {
+ DoubleIntegerArrayQuickSort.sort(keys, vals, size);
}
- return n_r;
}
@Override
@@ -304,9 +416,11 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
* @author Erich Schubert
*
* @apiviz.exclude
+ *
+ * @param <O> Object type
*/
- public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
- protected D rmax = null;
+ public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> {
+ protected double rmax;
protected int nmin = 0;
@@ -315,15 +429,14 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- final D distanceFactory = (distanceFunction != null) ? distanceFunction.getDistanceFactory() : null;
- final DistanceParameter<D> rmaxP = new DistanceParameter<>(RMAX_ID, distanceFactory);
+ final DoubleParameter rmaxP = new DoubleParameter(RMAX_ID);
if(config.grab(rmaxP)) {
- rmax = rmaxP.getValue();
+ rmax = rmaxP.doubleValue();
}
final IntParameter nminP = new IntParameter(NMIN_ID, 20);
if(config.grab(nminP)) {
- nmin = nminP.getValue();
+ nmin = nminP.intValue();
}
final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5);
@@ -333,7 +446,7 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
}
@Override
- protected LOCI<O, D> makeInstance() {
+ protected LOCI<O> makeInstance() {
return new LOCI<>(distanceFunction, rmax, nmin, alpha);
}
}