diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java | 331 |
1 files changed, 222 insertions, 109 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java index e76c6034..8d371d4c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,9 +23,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.Arrays; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; @@ -37,15 +35,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; -import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; -import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; @@ -54,23 +52,24 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.DoubleIntegerArrayQuickSort; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; /** * Fast Outlier Detection Using the "Local Correlation Integral". * - * Exact implementation only, not aLOCI. See {@link ALOCI} + * Exact implementation only, not aLOCI. See {@link ALOCI}. * * Outlier detection using multiple epsilon neighborhoods. * + * This implementation has O(n<sup>3</sup> log n) runtime complexity! + * * Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos: * LOCI: Fast Outlier Detection Using the Local Correlation Integral. In: Proc. * 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003. @@ -80,13 +79,12 @@ import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; * @apiviz.has RangeQuery * * @param <O> Object type - * @param <D> Distance type */ @Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral") @Description("Algorithm to compute outliers based on the Local Correlation Integral") @Reference(authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802") -@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI"}) -public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm { +@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI" }) +public class LOCI<O> extends AbstractDistanceBasedAlgorithm<O, OutlierResult> implements OutlierAlgorithm { /** * The logger for this class. */ @@ -111,7 +109,7 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas /** * Holds the value of {@link #RMAX_ID}. */ - private D rmax; + private double rmax; /** * Holds the value of {@link #NMIN_ID}. @@ -131,7 +129,7 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas * @param nmin Minimum neighborhood size * @param alpha Alpha value */ - public LOCI(DistanceFunction<? super O, D> distanceFunction, D rmax, int nmin, double alpha) { + public LOCI(DistanceFunction<? super O> distanceFunction, double rmax, int nmin, double alpha) { super(distanceFunction); this.rmax = rmax; this.nmin = nmin; @@ -146,96 +144,62 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas * @return Outlier result */ public OutlierResult run(Database database, Relation<O> relation) { - DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); - RangeQuery<O, D> rangeQuery = database.getRangeQuery(distFunc); + DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); + RangeQuery<O> rangeQuery = database.getRangeQuery(distFunc); + DBIDs ids = relation.getDBIDs(); - FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", relation.size(), LOG) : null; // LOCI preprocessing step - WritableDataStore<ArrayList<DoubleIntPair>> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, ArrayList.class); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(iditer, rmax); - // build list of critical distances - ArrayList<DoubleIntPair> cdist = new ArrayList<>(neighbors.size() << 1); - { - for(int i = 0; i < neighbors.size(); i++) { - DistanceDBIDPair<D> r = neighbors.get(i); - if(i + 1 < neighbors.size() && r.getDistance().compareTo(neighbors.get(i + 1).getDistance()) == 0) { - continue; - } - cdist.add(new DoubleIntPair(r.getDistance().doubleValue(), i)); - final double ri = r.getDistance().doubleValue() / alpha; - if(ri <= rmax.doubleValue()) { - cdist.add(new DoubleIntPair(ri, Integer.MIN_VALUE)); - } - } - } - Collections.sort(cdist); - // fill the gaps to have fast lookups of number of neighbors at a given - // distance. - int lastk = 0; - for(DoubleIntPair c : cdist) { - if(c.second == Integer.MIN_VALUE) { - c.second = lastk; - } - else { - lastk = c.second; - } - } - - interestingDistances.put(iditer, cdist); - if(progressPreproc != null) { - progressPreproc.incrementProcessed(LOG); - } - } - if(progressPreproc != null) { - progressPreproc.ensureCompleted(LOG); - } + WritableDataStore<DoubleIntArrayList> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, DoubleIntArrayList.class); + precomputeInterestingRadii(ids, rangeQuery, interestingDistances); // LOCI main step FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null; WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); DoubleMinMax minmax = new DoubleMinMax(); - for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - final List<DoubleIntPair> cdist = interestingDistances.get(iditer); - final double maxdist = cdist.get(cdist.size() - 1).first; - final int maxneig = cdist.get(cdist.size() - 1).second; + // Shared instance, to save allocations. + MeanVariance mv_n_r_alpha = new MeanVariance(); + + for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + final DoubleIntArrayList cdist = interestingDistances.get(iditer); + final double maxdist = cdist.getDouble(cdist.size() - 1); + final int maxneig = cdist.getInt(cdist.size() - 1); double maxmdefnorm = 0.0; double maxnormr = 0; if(maxneig >= nmin) { - D range = distFunc.getDistanceFactory().fromDouble(maxdist); // Compute the largest neighborhood we will need. - DistanceDBIDList<D> maxneighbors = rangeQuery.getRangeForDBID(iditer, range); - // TODO: Ensure the set is sorted. Should be a no-op with most indexes. + DoubleDBIDList maxneighbors = rangeQuery.getRangeForDBID(iditer, maxdist); + // TODO: Ensure the result is sorted. This is currently implied. + // For any critical distance, compute the normalized MDEF score. - for(DoubleIntPair c : cdist) { + for(int i = 0, size = cdist.size(); i < size; i++) { // Only start when minimum size is fulfilled - if (c.second < nmin) { + if(cdist.getInt(i) < nmin) { continue; } - final double r = c.first; + final double r = cdist.getDouble(i); final double alpha_r = alpha * r; - // compute n(p_i, \alpha * r) from list (note: alpha_r is different from c!) - final int n_alphar = elementsAtRadius(cdist, alpha_r); + // compute n(p_i, \alpha * r) from list (note: alpha_r is not cdist!) + final int n_alphar = cdist.getInt(cdist.find(alpha_r)); // compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF} - MeanVariance mv_n_r_alpha = new MeanVariance(); - // TODO: optimize for double distances - for (DistanceDBIDListIter<D> neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) { + mv_n_r_alpha.reset(); + for(DoubleDBIDListIter neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) { // Stop at radius r - if(neighbor.getDistance().doubleValue() > r) { + if(neighbor.doubleValue() > r) { break; } - int rn_alphar = elementsAtRadius(interestingDistances.get(neighbor), alpha_r); + DoubleIntArrayList cdist2 = interestingDistances.get(neighbor); + int rn_alphar = cdist2.getInt(cdist2.find(alpha_r)); mv_n_r_alpha.put(rn_alphar); } // We only use the average and standard deviation final double nhat_r_alpha = mv_n_r_alpha.getMean(); final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev(); - // Redundant divisions removed. - final double mdef = (nhat_r_alpha - n_alphar); // / nhat_r_alpha; - final double sigmamdef = sigma_nhat_r_alpha; // / nhat_r_alpha; + // Redundant divisions by nhat_r_alpha removed. + final double mdef = nhat_r_alpha - n_alphar; + final double sigmamdef = sigma_nhat_r_alpha; final double mdefnorm = mdef / sigmamdef; if(mdefnorm > maxmdefnorm) { @@ -246,46 +210,194 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas } else { // FIXME: when nmin was not fulfilled - what is the proper value then? - maxmdefnorm = 1.0; + maxmdefnorm = Double.POSITIVE_INFINITY; maxnormr = maxdist; } mdef_norm.putDouble(iditer, maxmdefnorm); mdef_radius.putDouble(iditer, maxnormr); minmax.put(maxmdefnorm); - if(progressLOCI != null) { - progressLOCI.incrementProcessed(LOG); - } - } - if(progressLOCI != null) { - progressLOCI.ensureCompleted(LOG); + LOG.incrementProcessed(progressLOCI); } - Relation<Double> scoreResult = new MaterializedRelation<>("LOCI normalized MDEF", "loci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs()); + LOG.ensureCompleted(progressLOCI); + DoubleRelation scoreResult = new MaterializedDoubleRelation("LOCI normalized MDEF", "loci-mdef-outlier", mdef_norm, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0); OutlierResult result = new OutlierResult(scoreMeta, scoreResult); - result.addChildResult(new MaterializedRelation<>("LOCI MDEF Radius", "loci-critical-radius", TypeUtil.DOUBLE, mdef_radius, relation.getDBIDs())); + result.addChildResult(new MaterializedDoubleRelation("LOCI MDEF Radius", "loci-critical-radius", mdef_radius, relation.getDBIDs())); return result; } /** - * Get the number of objects for a given radius, from the list of critical - * distances, storing (radius, count) pairs. + * Preprocessing step: determine the radii of interest for each point. * - * @param criticalDistances - * @param radius - * @return Number of elements at the given radius + * @param ids IDs to process + * @param rangeQuery Range query + * @param interestingDistances Distances of interest */ - protected int elementsAtRadius(List<DoubleIntPair> criticalDistances, final double radius) { - int n_r = 0; - for(DoubleIntPair c2 : criticalDistances) { - if(c2.first > radius) { - break; + protected void precomputeInterestingRadii(DBIDs ids, RangeQuery<O> rangeQuery, WritableDataStore<DoubleIntArrayList> interestingDistances) { + FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", ids.size(), LOG) : null; + for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, rmax); + // build list of critical distances + DoubleIntArrayList cdist = new DoubleIntArrayList(neighbors.size() << 1); + { + int i = 0; + DoubleDBIDListIter ni = neighbors.iter(); + while(ni.valid()) { + final double curdist = ni.doubleValue(); + ++i; + ni.advance(); + // Skip, if tied to the next object: + if(ni.valid() && curdist == ni.doubleValue()) { + continue; + } + cdist.append(curdist, i); + // Scale radius, and reinsert + if(alpha != 1.) { + final double ri = curdist / alpha; + if(ri <= rmax) { + cdist.append(ri, Integer.MIN_VALUE); + } + } + } } - if(c2.second != Integer.MIN_VALUE) { - // Update - n_r = c2.second; + cdist.sort(); + + // fill the gaps to have fast lookups of number of neighbors at a given + // distance. + int lastk = 0; + for(int i = 0, size = cdist.size(); i < size; i++) { + final int k = cdist.getInt(i); + if(k == Integer.MIN_VALUE) { + cdist.setValue(i, lastk); + } + else { + lastk = k; + } } + // TODO: shrink the list, removing duplicate radii? + + interestingDistances.put(iditer, cdist); + LOG.incrementProcessed(progressPreproc); + } + LOG.ensureCompleted(progressPreproc); + } + + /** + * Array of double-int values. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected static class DoubleIntArrayList { + /** + * Double keys + */ + double[] keys; + + /** + * Integer values + */ + int[] vals; + + /** + * Used size + */ + int size = 0; + + /** + * Constructor. + * + * @param alloc Initial allocation. + */ + public DoubleIntArrayList(int alloc) { + keys = new double[alloc]; + vals = new int[alloc]; + size = 0; + } + + /** + * Collection size. + * + * @return Size + */ + public int size() { + return size; + } + + /** + * Get the key at the given position. + * + * @param i Position + * @return Key + */ + public double getDouble(int i) { + return keys[i]; + } + + /** + * Get the value at the given position. + * + * @param i Position + * @return Value + */ + public int getInt(int i) { + return vals[i]; + } + + /** + * Get the value at the given position. + * + * @param i Position + * @param val New value + */ + public void setValue(int i, int val) { + vals[i] = val; + } + + /** + * Append a key-value pair. + * + * @param key Key to append + * @param val Value to append. + */ + public void append(double key, int val) { + if(size == keys.length) { + keys = Arrays.copyOf(keys, size << 1); + vals = Arrays.copyOf(vals, size << 1); + } + keys[size] = key; + vals[size] = val; + ++size; + } + + /** + * Find the last position with a smaller or equal key. + * + * @param search Key + * @return Position + */ + public int find(final double search) { + int a = 0, b = size - 1; + while(a <= b) { + final int mid = (a + b) >>> 1; + final double cur = keys[mid]; + if(cur > search) { + b = mid - 1; + } + else { // less or equal! + a = mid + 1; + } + } + return b; + } + + /** + * Sort the array list. + */ + public void sort() { + DoubleIntegerArrayQuickSort.sort(keys, vals, size); } - return n_r; } @Override @@ -304,9 +416,11 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas * @author Erich Schubert * * @apiviz.exclude + * + * @param <O> Object type */ - public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { - protected D rmax = null; + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { + protected double rmax; protected int nmin = 0; @@ -315,15 +429,14 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - final D distanceFactory = (distanceFunction != null) ? distanceFunction.getDistanceFactory() : null; - final DistanceParameter<D> rmaxP = new DistanceParameter<>(RMAX_ID, distanceFactory); + final DoubleParameter rmaxP = new DoubleParameter(RMAX_ID); if(config.grab(rmaxP)) { - rmax = rmaxP.getValue(); + rmax = rmaxP.doubleValue(); } final IntParameter nminP = new IntParameter(NMIN_ID, 20); if(config.grab(nminP)) { - nmin = nminP.getValue(); + nmin = nminP.intValue(); } final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5); @@ -333,7 +446,7 @@ public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas } @Override - protected LOCI<O, D> makeInstance() { + protected LOCI<O> makeInstance() { return new LOCI<>(distanceFunction, rmax, nmin, alpha); } } |