diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof')
12 files changed, 4250 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java new file mode 100644 index 00000000..d48679a9 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java @@ -0,0 +1,737 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * Fast Outlier Detection Using the "approximate Local Correlation Integral".
+ *
+ * Outlier detection using multiple epsilon neighborhoods.
+ *
+ * Reference:
+ * <p>
+ * S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos:<br />
+ * LOCI: Fast Outlier Detection Using the Local Correlation Integral.<br />
+ * In: Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore,
+ * India, 2003.
+ * </p>
+ *
+ * @author Jonathan von Brünken
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf ALOCIQuadTree
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral")
+@Description("Algorithm to compute outliers based on the Local Correlation Integral")
+@Reference(authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802")
+public class ALOCI<O extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(ALOCI.class);
+
+ /**
+ * Minimum size for a leaf.
+ */
+ private int nmin;
+
+ /**
+ * Alpha (level difference of sampling and counting neighborhoods)
+ */
+ private int alpha;
+
+ /**
+ * Number of trees to generate (forest size)
+ */
+ private int g;
+
+ /**
+ * Random generator
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Distance function
+ */
+ private NumberVectorDistanceFunction<D> distFunc;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction Distance function
+ * @param nmin Minimum neighborhood size
+ * @param alpha Alpha value
+ * @param g Number of grids to use
+ * @param rnd Random generator.
+ */
+ public ALOCI(NumberVectorDistanceFunction<D> distanceFunction, int nmin, int alpha, int g, RandomFactory rnd) {
+ super();
+ this.distFunc = distanceFunction;
+ this.nmin = nmin;
+ this.alpha = alpha;
+ this.g = g;
+ this.rnd = rnd;
+ }
+
+ public OutlierResult run(Database database, Relation<O> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+ final Random random = rnd.getRandom();
+ FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("Build aLOCI quadtress", g, LOG) : null;
+
+ // Compute extend of dataset.
+ double[] min, max;
+ {
+ Pair<O, O> hbbs = DatabaseUtil.computeMinMax(relation);
+ double maxd = 0;
+ min = new double[dim];
+ max = new double[dim];
+ for(int i = 0; i < dim; i++) {
+ min[i] = hbbs.first.doubleValue(i);
+ max[i] = hbbs.second.doubleValue(i);
+ maxd = Math.max(maxd, max[i] - min[i]);
+ }
+ // Enlarge bounding box to have equal lengths.
+ for(int i = 0; i < dim; i++) {
+ double diff = (maxd - (max[i] - min[i])) * .5;
+ min[i] -= diff;
+ max[i] += diff;
+ }
+ }
+
+ List<ALOCIQuadTree> qts = new ArrayList<>(g);
+
+ double[] nshift = new double[dim];
+ ALOCIQuadTree qt = new ALOCIQuadTree(min, max, nshift, nmin, relation);
+ qts.add(qt);
+ if(progressPreproc != null) {
+ progressPreproc.incrementProcessed(LOG);
+ }
+ /*
+ * create the remaining g-1 shifted QuadTrees. This not clearly described in
+ * the paper and therefore implemented in a way that achieves good results
+ * with the test data.
+ */
+ for(int shift = 1; shift < g; shift++) {
+ double[] svec = new double[dim];
+ for(int i = 0; i < dim; i++) {
+ svec[i] = random.nextDouble() * (max[i] - min[i]);
+ }
+ qt = new ALOCIQuadTree(min, max, svec, nmin, relation);
+ qts.add(qt);
+ if(progressPreproc != null) {
+ progressPreproc.incrementProcessed(LOG);
+ }
+ }
+ if(progressPreproc != null) {
+ progressPreproc.ensureCompleted(LOG);
+ }
+
+ // aLOCI main loop: evaluate
+ FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("Compute aLOCI scores", relation.size(), LOG) : null;
+ WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
+ DoubleMinMax minmax = new DoubleMinMax();
+
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ final O obj = relation.get(iditer);
+
+ double maxmdefnorm = 0;
+ // For each level
+ for(int l = 0;; l++) {
+ // Find the closest C_i
+ Node ci = null;
+ for(int i = 0; i < g; i++) {
+ Node ci2 = qts.get(i).findClosestNode(obj, l);
+ if(ci2.getLevel() != l) {
+ continue;
+ }
+ // TODO: always use manhattan?
+ if(ci == null || distFunc.distance(ci.getCenter(), obj).compareTo(distFunc.distance(ci2.getCenter(), obj)) > 0) {
+ ci = ci2;
+ }
+ }
+ // logger.debug("level:" + (ci != null ? ci.getLevel() : -1) +" l:"+l);
+ if(ci == null) {
+ break; // no matching tree for this level.
+ }
+
+ // Find the closest C_j
+ Node cj = null;
+ for(int i = 0; i < g; i++) {
+ Node cj2 = qts.get(i).findClosestNode(ci.getCenter(), l - alpha);
+ // TODO: allow higher levels or not?
+ if(cj != null && cj2.getLevel() < cj.getLevel()) {
+ continue;
+ }
+ // TODO: always use manhattan?
+ if(cj == null || distFunc.distance(cj.getCenter(), ci.getCenter()).compareTo(distFunc.distance(cj2.getCenter(), ci.getCenter())) > 0) {
+ cj = cj2;
+ }
+ }
+ // logger.debug("level:" + (cj != null ? cj.getLevel() : -1) +" l:"+l);
+ if(cj == null) {
+ continue; // no matching tree for this level.
+ }
+ double mdefnorm = calculate_MDEF_norm(cj, ci);
+ // logger.warning("level:" + ci.getLevel() + "/" + cj.getLevel() +
+ // " mdef: " + mdefnorm);
+ maxmdefnorm = Math.max(maxmdefnorm, mdefnorm);
+ }
+ // Store results
+ mdef_norm.putDouble(iditer, maxmdefnorm);
+ minmax.put(maxmdefnorm);
+ if(progressLOCI != null) {
+ progressLOCI.incrementProcessed(LOG);
+ }
+ }
+ if(progressLOCI != null) {
+ progressLOCI.ensureCompleted(LOG);
+ }
+ Relation<Double> scoreResult = new MaterializedRelation<>("aLOCI normalized MDEF", "aloci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs());
+ OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY);
+ OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
+ return result;
+ }
+
+ /**
+ * Method for the MDEF calculation
+ *
+ * @param sn Sampling Neighborhood
+ * @param cg Counting Neighborhood
+ *
+ * @return MDEF norm
+ */
+ private static double calculate_MDEF_norm(Node sn, Node cg) {
+ // get the square sum of the counting neighborhoods box counts
+ long sq = sn.getSquareSum(cg.getLevel() - sn.getLevel());
+ /*
+ * if the square sum is equal to box count of the sampling Neighborhood then
+ * n_hat is equal one, and as cg needs to have at least one Element mdef
+ * would get zero or lower than zero. This is the case when all of the
+ * counting Neighborhoods contain one or zero Objects. Additionally, the
+ * cubic sum, square sum and sampling Neighborhood box count are all equal,
+ * which leads to sig_n_hat being zero and thus mdef_norm is either negative
+ * infinite or undefined. As the distribution of the Objects seem quite
+ * uniform, a mdef_norm value of zero ( = no outlier) is appropriate and
+ * circumvents the problem of undefined values.
+ */
+ if(sq == sn.getCount()) {
+ return 0.0;
+ }
+ // calculation of mdef according to the paper and standardization as done in
+ // LOCI
+ long cb = sn.getCubicSum(cg.getLevel() - sn.getLevel());
+ double n_hat = (double) sq / sn.getCount();
+ double sig_n_hat = java.lang.Math.sqrt(cb * sn.getCount() - (sq * sq)) / sn.getCount();
+ // Avoid NaN - correct result 0.0?
+ if(sig_n_hat < Double.MIN_NORMAL) {
+ return 0.0;
+ }
+ double mdef = n_hat - cg.getCount();
+ return mdef / sig_n_hat;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(distFunc.getInputTypeRestriction());
+ }
+
+ /**
+ * Simple quadtree for ALOCI. Not storing the actual objects, just the counts.
+ *
+ * Furthermore, the quadtree can be shifted by a specified vector, wrapping
+ * around min/max
+ *
+ * @author Jonathan von Brünken
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf Node
+ */
+ static class ALOCIQuadTree {
+ /**
+ * Tree parameters
+ */
+ private double[] shift, min, width;
+
+ /**
+ * Maximum fill for a page before splitting
+ */
+ private int nmin;
+
+ /**
+ * Tree root
+ */
+ Node root;
+
+ /**
+ * Relation indexed.
+ */
+ private Relation<? extends NumberVector<?>> relation;
+
+ /**
+ * Constructor.
+ *
+ * @param min Minimum coordinates
+ * @param max Maximum coordinates
+ * @param shift Tree shift offset
+ * @param nmin Maximum size for a page to split
+ * @param relation Relation to index
+ */
+ public ALOCIQuadTree(double[] min, double[] max, double[] shift, int nmin, Relation<? extends NumberVector<?>> relation) {
+ super();
+ assert (min.length <= 32) : "Quadtrees are only supported for up to 32 dimensions";
+ this.shift = shift;
+ this.nmin = nmin;
+ this.min = min;
+ this.width = new double[min.length];
+ for(int d = 0; d < min.length; d++) {
+ width[d] = max[d] - min[d];
+ if(width[d] <= 0) {
+ width[d] = 1;
+ }
+ }
+ double[] center = new double[min.length];
+ for(int d = 0; d < min.length; d++) {
+ if(shift[d] < width[d] * .5) {
+ center[d] = min[d] + shift[d] + width[d] * .5;
+ }
+ else {
+ center[d] = min[d] + shift[d] - width[d] * .5;
+ }
+ }
+ this.relation = relation;
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ List<Node> children = new ArrayList<>();
+ bulkLoad(min.clone(), max.clone(), children, ids, 0, ids.size(), 0, 0, 0);
+ this.root = new Node(0, new Vector(center), ids.size(), -1, children);
+ }
+
+ /**
+ * Bulk load the tree
+ *
+ * @param lmin Subtree minimum (unshifted, will be modified)
+ * @param lmax Subtree maximum (unshifted, will be modified)
+ * @param children List of children for current parent
+ * @param ids IDs to process
+ * @param start Start of ids subinterval
+ * @param end End of ids subinterval
+ * @param dim Current dimension
+ * @param level Current tree level
+ * @param code Bit code of node position
+ */
+ private void bulkLoad(double[] lmin, double[] lmax, List<Node> children, ArrayModifiableDBIDs ids, int start, int end, int dim, int level, int code) {
+ // logger.warning(FormatUtil.format(lmin)+" "+FormatUtil.format(lmax)+" "+start+"->"+end+" "+(end-start));
+ // Hack: Check degenerate cases that won't split
+ if(dim == 0) {
+ DBIDArrayIter iter = ids.iter();
+ iter.seek(start);
+ NumberVector<?> first = relation.get(iter);
+ iter.advance();
+ boolean degenerate = true;
+ loop: for(; iter.getOffset() < end; iter.advance()) {
+ NumberVector<?> other = relation.get(iter);
+ for(int d = 0; d < lmin.length; d++) {
+ if(Math.abs(first.doubleValue(d) - other.doubleValue(d)) > 1E-15) {
+ degenerate = false;
+ break loop;
+ }
+ }
+ }
+ if(degenerate) {
+ double[] center = new double[lmin.length];
+ for(int d = 0; d < lmin.length; d++) {
+ center[d] = lmin[d] * .5 + lmax[d] * .5 + shift[d];
+ if(center[d] > min[d] + width[d]) {
+ center[d] -= width[d];
+ }
+ }
+ children.add(new Node(code, new Vector(center), end - start, level, null));
+ return;
+ }
+ }
+ // Complete level
+ if(dim == lmin.length) {
+ double[] center = new double[lmin.length];
+ for(int d = 0; d < lmin.length; d++) {
+ center[d] = lmin[d] * .5 + lmax[d] * .5 + shift[d];
+ if(center[d] > min[d] + width[d]) {
+ center[d] -= width[d];
+ }
+ }
+ if(end - start < nmin) {
+ children.add(new Node(code, new Vector(center), end - start, level, null));
+ return;
+ }
+ else {
+ List<Node> newchildren = new ArrayList<>();
+ bulkLoad(lmin, lmax, newchildren, ids, start, end, 0, level + 1, 0);
+ children.add(new Node(code, new Vector(center), end - start, level, newchildren));
+ return;
+ }
+ }
+ else {
+ // Partially sort data, by dimension dim < mid
+ DBIDArrayIter siter = ids.iter(), eiter = ids.iter();
+ siter.seek(start);
+ eiter.seek(end - 1);
+ while(siter.getOffset() < eiter.getOffset()) {
+ if(getShiftedDim(relation.get(siter), dim, level) <= .5) {
+ siter.advance();
+ continue;
+ }
+ if(getShiftedDim(relation.get(eiter), dim, level) > 0.5) {
+ eiter.retract();
+ continue;
+ }
+ ids.swap(siter.getOffset(), eiter.getOffset() - 1);
+ siter.advance();
+ eiter.retract();
+ }
+ final int spos = siter.getOffset();
+ if(start < spos) {
+ final double tmp = lmax[dim];
+ lmax[dim] = lmax[dim] * .5 + lmin[dim] * .5;
+ bulkLoad(lmin, lmax, children, ids, start, spos, dim + 1, level, code);
+ lmax[dim] = tmp; // Restore
+ }
+ if(spos < end) {
+ final double tmp = lmin[dim];
+ lmin[dim] = lmax[dim] * .5 + lmin[dim] * .5;
+ bulkLoad(lmin, lmax, children, ids, spos, end, dim + 1, level, code | (1 << dim));
+ lmin[dim] = tmp; // Restore
+ }
+ }
+ }
+
+ /**
+ * Shift and wrap a single dimension.
+ *
+ * @param obj Object
+ * @param dim Dimension
+ * @param level Level (controls scaling/wraping!)
+ * @return Shifted position
+ */
+ private double getShiftedDim(NumberVector<?> obj, int dim, int level) {
+ double pos = obj.doubleValue(dim) + shift[dim];
+ pos = (pos - min[dim]) / width[dim] * (1 + level);
+ return pos - Math.floor(pos);
+ }
+
+ /**
+ * Find the closest node (of depth tlevel or above, if there is no node at
+ * this depth) for the given vector.
+ *
+ * @param vec Query vector
+ * @param tlevel Target level
+ * @return Node
+ */
+ public Node findClosestNode(NumberVector<?> vec, int tlevel) {
+ Node cur = root;
+ for(int level = 0; level <= tlevel; level++) {
+ if(cur.children == null) {
+ break;
+ }
+ int code = 0;
+ for(int d = 0; d < min.length; d++) {
+ if(getShiftedDim(vec, d, level) > .5) {
+ code |= 1 << d;
+ }
+ }
+ boolean found = false;
+ for(Node child : cur.children) {
+ if(child.code == code) {
+ cur = child;
+ found = true;
+ break;
+ }
+ }
+ if(!found) {
+ break; // Do not descend
+ }
+ }
+ return cur;
+ }
+ }
+
+ /**
+ * Node of the ALOCI Quadtree
+ *
+ * @author Erich Schubert
+ */
+ static class Node {
+ /**
+ * Position code
+ */
+ final int code;
+
+ /**
+ * Number of elements
+ */
+ final int count;
+
+ /**
+ * Level of node
+ */
+ final int level;
+
+ /**
+ * Child nodes, may be null
+ */
+ List<Node> children;
+
+ /**
+ * Parent node
+ */
+ Node parent = null;
+
+ /**
+ * Center vector
+ */
+ Vector center;
+
+ /**
+ * Constructor.
+ *
+ * @param code Node code
+ * @param center Center vector
+ * @param count Element count
+ * @param level Node level
+ * @param children Children list
+ */
+ protected Node(int code, Vector center, int count, int level, List<Node> children) {
+ this.code = code;
+ this.center = center;
+ this.count = count;
+ this.level = level;
+ this.children = children;
+ if(children != null) {
+ for(Node child : children) {
+ child.parent = this;
+ }
+ }
+ }
+
+ /**
+ * Get level of node.
+ *
+ * @return Level of node
+ */
+ public int getLevel() {
+ return level;
+ }
+
+ /**
+ * Get count of subtree
+ *
+ * @return subtree count
+ */
+ public int getCount() {
+ return count;
+ }
+
+ /**
+ * Return center vector
+ *
+ * @return center vector
+ */
+ public Vector getCenter() {
+ return center;
+ }
+
+ /**
+ * Get sum of squares, recursively
+ *
+ * @param levels Depth to collect
+ * @return Sum of squares
+ */
+ public long getSquareSum(int levels) {
+ if(levels <= 0 || children == null) {
+ return ((long) count) * ((long) count);
+ }
+ long agg = 0;
+ for(Node child : children) {
+ agg += child.getSquareSum(levels - 1);
+ }
+ return agg;
+ }
+
+ /**
+ * Get cubic sum.
+ *
+ * @param levels Level to collect
+ * @return sum of cubes
+ */
+ public long getCubicSum(int levels) {
+ if(levels <= 0 || children == null) {
+ return ((long) count) * ((long) count) * ((long) count);
+ }
+ long agg = 0;
+ for(Node child : children) {
+ agg += child.getCubicSum(levels - 1);
+ }
+ return agg;
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the minimum neighborhood size
+ */
+ public static final OptionID NMIN_ID = new OptionID("loci.nmin", "Minimum neighborhood size to be considered.");
+
+ /**
+ * Parameter to specify the averaging neighborhood scaling.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("loci.alpha", "Scaling factor for averaging neighborhood");
+
+ /**
+ * Parameter to specify the number of Grids to use.
+ */
+ public static final OptionID GRIDS_ID = new OptionID("loci.g", "The number of Grids to use.");
+
+ /**
+ * Parameter to specify the seed to initialize Random.
+ */
+ public static final OptionID SEED_ID = new OptionID("loci.seed", "The seed to use for initializing Random.");
+
+ /**
+ * Neighborhood minimum size
+ */
+ protected int nmin = 0;
+
+ /**
+ * Alpha: number of levels difference to use in comparison
+ */
+ protected int alpha = 4;
+
+ /**
+ * G: number of shifted trees to create.
+ */
+ protected int g = 1;
+
+ /**
+ * Random generator
+ */
+ protected RandomFactory rnd;
+
+ /**
+ * The distance function
+ */
+ private NumberVectorDistanceFunction<D> distanceFunction;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ ObjectParameter<NumberVectorDistanceFunction<D>> distanceFunctionP = makeParameterDistanceFunction(EuclideanDistanceFunction.class, NumberVectorDistanceFunction.class);
+ if(config.grab(distanceFunctionP)) {
+ distanceFunction = distanceFunctionP.instantiateClass(config);
+ }
+
+ final IntParameter nminP = new IntParameter(NMIN_ID, 20);
+ if(config.grab(nminP)) {
+ nmin = nminP.getValue();
+ }
+
+ final IntParameter g = new IntParameter(GRIDS_ID, 1);
+ if(config.grab(g)) {
+ this.g = g.getValue();
+ }
+
+ final RandomParameter rndP = new RandomParameter(SEED_ID);
+ if(config.grab(rndP)) {
+ this.rnd = rndP.getValue();
+ }
+
+ final IntParameter alphaP = new IntParameter(ALPHA_ID, 4);
+ if(config.grab(alphaP)) {
+ this.alpha = alphaP.getValue();
+ if(this.alpha < 1) {
+ this.alpha = 1;
+ }
+ }
+ }
+
+ @Override
+ protected ALOCI<O, D> makeInstance() {
+ return new ALOCI<>(distanceFunction, nmin, alpha, g, rnd);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java new file mode 100644 index 00000000..80f60e8b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java @@ -0,0 +1,612 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceKNNList; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; +import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; +import de.lmu.ifi.dbs.elki.database.query.rknn.RKNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * <p> + * Flexible variant of the "Local Outlier Factor" algorithm. + * </p> + * + * <p> + * This implementation diverts from the original LOF publication in that it + * allows the user to use a different distance function for the reachability + * distance and neighborhood determination (although the default is to use the + * same value.) + * </p> + * + * <p> + * The k nearest neighbors are determined using the parameter + * {@link de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm#DISTANCE_FUNCTION_ID} + * , while the reference set used in reachability distance computation is + * configured using {@link Parameterizer#REACHABILITY_DISTANCE_FUNCTION_ID}. + * </p> + * + * <p> + * The original LOF parameter was called "minPts". For consistency + * with the name "kNN query", we chose to rename the parameter to {@code k}. + * Flexible LOF allows you to set the two values different, which yields the + * parameters {@link Parameterizer#KREF_ID} ({@code -lof.krefer}) and + * {@link Parameterizer#KREACH_ID} ({@code -lof.kreach}) + * </p> + * + * <p> + * Reference: <br> + * M. M. Breunig, H.-P. Kriegel, R. Ng, J. Sander: LOF: Identifying + * Density-Based Local Outliers. <br> + * In: Proc. 2nd ACM SIGMOD Int. Conf. on Management of Data (SIGMOD'00), + * Dallas, TX, 2000. + * </p> + * + * @author Peer Kröger + * @author Erich Schubert + * @author Elke Achtert + * + * @apiviz.has LOFResult oneway - - computes + * @apiviz.has KNNQuery + * + * @param <O> the type of DatabaseObjects handled by this Algorithm + * @param <D> Distance type + */ +@Title("LOF: Local Outlier Factor") +@Description("Algorithm to compute density-based local outlier factors in a database based on the neighborhood size parameter 'k'") +@Reference(authors = "M. M. Breunig, H.-P. Kriegel, R. Ng, and J. Sander", title = "LOF: Identifying Density-Based Local Outliers", booktitle = "Proc. 2nd ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '00), Dallas, TX, 2000", url = "http://dx.doi.org/10.1145/342009.335388") +public class FlexibleLOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(FlexibleLOF.class); + + /** + * Number of neighbors in comparison set. + */ + protected int krefer = 2; + + /** + * Number of neighbors used for reachability distance. + */ + protected int kreach = 2; + + /** + * Neighborhood distance function. + */ + protected DistanceFunction<? super O, D> referenceDistanceFunction; + + /** + * Reachability distance function. + */ + protected DistanceFunction<? super O, D> reachabilityDistanceFunction; + + /** + * Include object itself in kNN neighborhood. + * + * In the official LOF publication, the point itself is not considered to be + * part of its k nearest neighbors. + */ + private static boolean objectIsInKNN = false; + + /** + * Constructor. + * + * @param krefer The number of neighbors for reference + * @param kreach The number of neighbors for reachability distance + * @param neighborhoodDistanceFunction the neighborhood distance function + * @param reachabilityDistanceFunction the reachability distance function + */ + public FlexibleLOF(int krefer, int kreach, DistanceFunction<? super O, D> neighborhoodDistanceFunction, DistanceFunction<? super O, D> reachabilityDistanceFunction) { + super(); + this.krefer = krefer + (objectIsInKNN ? 0 : 1); + this.kreach = kreach + (objectIsInKNN ? 0 : 1); + this.referenceDistanceFunction = neighborhoodDistanceFunction; + this.reachabilityDistanceFunction = reachabilityDistanceFunction; + } + + /** + * Performs the Generalized LOF algorithm on the given database by calling + * {@link #doRunInTime}. + * + * @param database Database to query + * @param relation Data to process + * @return LOF outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + StepProgress stepprog = LOG.isVerbose() ? new StepProgress("LOF", 3) : null; + Pair<KNNQuery<O, D>, KNNQuery<O, D>> pair = getKNNQueries(database, relation, stepprog); + KNNQuery<O, D> kNNRefer = pair.getFirst(); + KNNQuery<O, D> kNNReach = pair.getSecond(); + return doRunInTime(relation.getDBIDs(), kNNRefer, kNNReach, stepprog).getResult(); + } + + /** + * Get the kNN queries for the algorithm. + * + * @param relation the data + * @param stepprog the progress logger + * @return the kNN queries for the algorithm + */ + private Pair<KNNQuery<O, D>, KNNQuery<O, D>> getKNNQueries(Database database, Relation<O> relation, StepProgress stepprog) { + // "HEAVY" flag for knnReach since it is used more than once + KNNQuery<O, D> knnReach = QueryUtil.getKNNQuery(relation, reachabilityDistanceFunction, kreach, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); + // No optimized kNN query - use a preprocessor! + if (!(knnReach instanceof PreprocessorKNNQuery)) { + if (stepprog != null) { + if (referenceDistanceFunction.equals(reachabilityDistanceFunction)) { + stepprog.beginStep(1, "Materializing neighborhoods w.r.t. reference neighborhood distance function.", LOG); + } else { + stepprog.beginStep(1, "Not materializing neighborhoods w.r.t. reference neighborhood distance function, but materializing neighborhoods w.r.t. reachability distance function.", LOG); + } + } + int kpreproc = (referenceDistanceFunction.equals(reachabilityDistanceFunction)) ? Math.max(kreach, krefer) : kreach; + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, reachabilityDistanceFunction, kpreproc); + database.addIndex(preproc); + DistanceQuery<O, D> rdq = database.getDistanceQuery(relation, reachabilityDistanceFunction); + knnReach = preproc.getKNNQuery(rdq, kreach); + } + + // knnReach is only used once + KNNQuery<O, D> knnRefer; + if (referenceDistanceFunction == reachabilityDistanceFunction || referenceDistanceFunction.equals(reachabilityDistanceFunction)) { + knnRefer = knnReach; + } else { + // do not materialize the first neighborhood, since it is used only once + knnRefer = QueryUtil.getKNNQuery(relation, referenceDistanceFunction, krefer); + } + + return new Pair<>(knnRefer, knnReach); + } + + /** + * Performs the Generalized LOF_SCORE algorithm on the given database and + * returns a {@link FlexibleLOF.LOFResult} encapsulating information that may + * be needed by an OnlineLOF algorithm. + * + * @param ids Object ids + * @param kNNRefer the kNN query w.r.t. reference neighborhood distance + * function + * @param kNNReach the kNN query w.r.t. reachability distance function + * @param stepprog Progress logger + * @return LOF result + */ + protected LOFResult<O, D> doRunInTime(DBIDs ids, KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, StepProgress stepprog) { + // Assert we got something + if (kNNRefer == null) { + throw new AbortException("No kNN queries supported by database for reference neighborhood distance function."); + } + if (kNNReach == null) { + throw new AbortException("No kNN queries supported by database for reachability distance function."); + } + + // Compute LRDs + if (stepprog != null) { + stepprog.beginStep(2, "Computing LRDs.", LOG); + } + WritableDoubleDataStore lrds = computeLRDs(ids, kNNReach); + + // compute LOF_SCORE of each db object + if (stepprog != null) { + stepprog.beginStep(3, "Computing LOFs.", LOG); + } + Pair<WritableDoubleDataStore, DoubleMinMax> lofsAndMax = computeLOFs(ids, lrds, kNNRefer); + WritableDoubleDataStore lofs = lofsAndMax.getFirst(); + // track the maximum value for normalization. + DoubleMinMax lofminmax = lofsAndMax.getSecond(); + + if (stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + + return new LOFResult<>(result, kNNRefer, kNNReach, lrds, lofs); + } + + /** + * Computes the local reachability density (LRD) of the specified objects. + * + * @param ids the ids of the objects + * @param knnReach the precomputed neighborhood of the objects w.r.t. the + * reachability distance + * @return the LRDs of the objects + */ + protected WritableDoubleDataStore computeLRDs(DBIDs ids, KNNQuery<O, D> knnReach) { + WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("LRD", ids.size(), LOG) : null; + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final KNNList<D> neighbors = knnReach.getKNNForDBID(iter, kreach); + double sum = 0.0; + int count = 0; + if (neighbors instanceof DoubleDistanceKNNList) { + // Fast version for double distances + for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if (objectIsInKNN || !DBIDUtil.equal(neighbor, iter)) { + KNNList<D> neighborsNeighbors = knnReach.getKNNForDBID(neighbor, kreach); + final double nkdist; + if (neighborsNeighbors instanceof DoubleDistanceKNNList) { + nkdist = ((DoubleDistanceKNNList) neighborsNeighbors).doubleKNNDistance(); + } else { + nkdist = neighborsNeighbors.getKNNDistance().doubleValue(); + } + sum += Math.max(neighbor.doubleDistance(), nkdist); + count++; + } + } + } else { + for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (objectIsInKNN || !DBIDUtil.equal(neighbor, iter)) { + KNNList<D> neighborsNeighbors = knnReach.getKNNForDBID(neighbor, kreach); + sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue()); + count++; + } + } + } + // Avoid division by 0 + final double lrd = (sum > 0) ? (count / sum) : Double.POSITIVE_INFINITY; + lrds.putDouble(iter, lrd); + if (lrdsProgress != null) { + lrdsProgress.incrementProcessed(LOG); + } + } + if (lrdsProgress != null) { + lrdsProgress.ensureCompleted(LOG); + } + return lrds; + } + + /** + * Computes the Local outlier factor (LOF) of the specified objects. + * + * @param ids the ids of the objects + * @param lrds the LRDs of the objects + * @param knnRefer the precomputed neighborhood of the objects w.r.t. the + * reference distance + * @return the LOFs of the objects and the maximum LOF + */ + protected Pair<WritableDoubleDataStore, DoubleMinMax> computeLOFs(DBIDs ids, DoubleDataStore lrds, KNNQuery<O, D> knnRefer) { + WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + // track the maximum value for normalization. + DoubleMinMax lofminmax = new DoubleMinMax(); + + FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), LOG) : null; + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final double lrdp = lrds.doubleValue(iter); + final double lof; + if (lrdp > 0 && !Double.isInfinite(lrdp)) { + final KNNList<D> neighbors = knnRefer.getKNNForDBID(iter, krefer); + double sum = 0.0; + int count = 0; + for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + // skip the point itself + if (objectIsInKNN || !DBIDUtil.equal(neighbor, iter)) { + sum += lrds.doubleValue(neighbor); + count++; + } + } + lof = sum / (count * lrdp); + } else { + lof = 1.0; + } + lofs.putDouble(iter, lof); + // update minimum and maximum + if (!Double.isInfinite(lof)) { + lofminmax.put(lof); + } + + if (progressLOFs != null) { + progressLOFs.incrementProcessed(LOG); + } + } + if (progressLOFs != null) { + progressLOFs.ensureCompleted(LOG); + } + return new Pair<>(lofs, lofminmax); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + final TypeInformation type; + if (reachabilityDistanceFunction.equals(referenceDistanceFunction)) { + type = reachabilityDistanceFunction.getInputTypeRestriction(); + } else { + type = new CombinedTypeInformation(referenceDistanceFunction.getInputTypeRestriction(), reachabilityDistanceFunction.getInputTypeRestriction()); + } + return TypeUtil.array(type); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Encapsulates information like the neighborhood, the LRD and LOF values of + * the objects during a run of the {@link FlexibleLOF} algorithm. + * + * @author Elke Achtert + */ + public static class LOFResult<O, D extends NumberDistance<D, ?>> { + /** + * The result of the run of the {@link FlexibleLOF} algorithm. + */ + private OutlierResult result; + + /** + * The kNN query w.r.t. the reference neighborhood distance. + */ + private final KNNQuery<O, D> kNNRefer; + + /** + * The kNN query w.r.t. the reachability distance. + */ + private final KNNQuery<O, D> kNNReach; + + /** + * The RkNN query w.r.t. the reference neighborhood distance. + */ + private RKNNQuery<O, D> rkNNRefer; + + /** + * The rkNN query w.r.t. the reachability distance. + */ + private RKNNQuery<O, D> rkNNReach; + + /** + * The LRD values of the objects. + */ + private final WritableDoubleDataStore lrds; + + /** + * The LOF values of the objects. + */ + private final WritableDoubleDataStore lofs; + + /** + * Encapsulates information generated during a run of the + * {@link FlexibleLOF} algorithm. + * + * @param result the result of the run of the {@link FlexibleLOF} algorithm + * @param kNNRefer the kNN query w.r.t. the reference neighborhood distance + * @param kNNReach the kNN query w.r.t. the reachability distance + * @param lrds the LRD values of the objects + * @param lofs the LOF values of the objects + */ + public LOFResult(OutlierResult result, KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, WritableDoubleDataStore lrds, WritableDoubleDataStore lofs) { + this.result = result; + this.kNNRefer = kNNRefer; + this.kNNReach = kNNReach; + this.lrds = lrds; + this.lofs = lofs; + } + + /** + * Get the knn query for the reference set. + * + * @return the kNN query w.r.t. the reference neighborhood distance + */ + public KNNQuery<O, D> getKNNRefer() { + return kNNRefer; + } + + /** + * Get the knn query for the reachability set. + * + * @return the kNN query w.r.t. the reachability distance + */ + public KNNQuery<O, D> getKNNReach() { + return kNNReach; + } + + /** + * Get the LRD data store. + * + * @return the LRD values of the objects + */ + public WritableDoubleDataStore getLrds() { + return lrds; + } + + /** + * Get the LOF data store. + * + * @return the LOF values of the objects + */ + public WritableDoubleDataStore getLofs() { + return lofs; + } + + /** + * Get the outlier result. + * + * @return the result of the run of the {@link FlexibleLOF} algorithm + */ + public OutlierResult getResult() { + return result; + } + + /** + * Sets the RkNN query w.r.t. the reference neighborhood distance. + * + * @param rkNNRefer the query to set + */ + public void setRkNNRefer(RKNNQuery<O, D> rkNNRefer) { + this.rkNNRefer = rkNNRefer; + } + + /** + * Get the RkNN query for the reference set. + * + * @return the RkNN query w.r.t. the reference neighborhood distance + */ + public RKNNQuery<O, D> getRkNNRefer() { + return rkNNRefer; + } + + /** + * Get the RkNN query for the reachability set. + * + * @return the RkNN query w.r.t. the reachability distance + */ + public RKNNQuery<O, D> getRkNNReach() { + return rkNNReach; + } + + /** + * Sets the RkNN query w.r.t. the reachability distance. + * + * @param rkNNReach the query to set + */ + public void setRkNNReach(RKNNQuery<O, D> rkNNReach) { + this.rkNNReach = rkNNReach; + } + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * The distance function to determine the reachability distance between + * database objects. + */ + public static final OptionID REACHABILITY_DISTANCE_FUNCTION_ID = new OptionID("lof.reachdistfunction", "Distance function to determine the reachability distance between database objects."); + + /** + * Parameter to specify the number of nearest neighbors of an object to be + * considered for computing its LOF_SCORE, must be an integer greater than + * 1. + */ + public static final OptionID KREF_ID = new OptionID("lof.krefer", "The number of nearest neighbors of an object to be considered for computing its LOF_SCORE."); + + /** + * Parameter to specify the number of nearest neighbors of an object to be + * considered for computing its reachability distance. + */ + public static final OptionID KREACH_ID = new OptionID("lof.kreach", "The number of nearest neighbors of an object to be considered for computing its LOF_SCORE."); + + /** + * The reference set size to use. + */ + protected int krefer = 2; + + /** + * The set size to use for reachability distance. + */ + protected int kreach = 2; + + /** + * Neighborhood distance function. + */ + protected DistanceFunction<O, D> neighborhoodDistanceFunction = null; + + /** + * Reachability distance function. + */ + protected DistanceFunction<O, D> reachabilityDistanceFunction = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + final IntParameter pK = new IntParameter(KREF_ID); + pK.addConstraint(new GreaterConstraint(1)); + if (config.grab(pK)) { + krefer = pK.intValue(); + } + + final IntParameter pK2 = new IntParameter(KREACH_ID); + pK2.setOptional(true); + pK2.addConstraint(new GreaterConstraint(1)); + if (config.grab(pK2)) { + kreach = pK2.intValue(); + } else { + kreach = krefer; + } + + final ObjectParameter<DistanceFunction<O, D>> reachDistP = new ObjectParameter<>(REACHABILITY_DISTANCE_FUNCTION_ID, DistanceFunction.class); + reachDistP.setOptional(true); + if (config.grab(reachDistP)) { + reachabilityDistanceFunction = reachDistP.instantiateClass(config); + } else { + reachabilityDistanceFunction = distanceFunction; + } + } + + @Override + protected FlexibleLOF<O, D> makeInstance() { + return new FlexibleLOF<>(kreach, krefer, distanceFunction, reachabilityDistanceFunction); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java new file mode 100644 index 00000000..ae297a3c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java @@ -0,0 +1,261 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * INFLO provides the Mining Algorithms (Two-way Search Method) for Influence
+ * Outliers using Symmetric Relationship
+ * <p>
+ * Reference: <br>
+ * <p>
+ * Jin, W., Tung, A., Han, J., and Wang, W. 2006<br/>
+ * Ranking outliers using symmetric neighborhood relationship<br/>
+ * In Proc. Pacific-Asia Conf. on Knowledge Discovery and Data Mining (PAKDD),
+ * Singapore
+ * </p>
+ *
+ * @author Ahmed Hettab
+ *
+ * @apiviz.has KNNQuery
+ *
+ * @param <O> the type of DatabaseObject the algorithm is applied on
+ */
+@Title("INFLO: Influenced Outlierness Factor")
+@Description("Ranking Outliers Using Symmetric Neigborhood Relationship")
+@Reference(authors = "Jin, W., Tung, A., Han, J., and Wang, W", title = "Ranking outliers using symmetric neighborhood relationship", booktitle = "Proc. Pacific-Asia Conf. on Knowledge Discovery and Data Mining (PAKDD), Singapore, 2006", url = "http://dx.doi.org/10.1007/11731139_68")
+public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(INFLO.class);
+
+ /**
+ * Parameter to specify if any object is a Core Object must be a double
+ * greater than 0.0
+ * <p>
+ * see paper "Two-way search method" 3.2
+ */
+ public static final OptionID M_ID = new OptionID("inflo.m", "The threshold");
+
+ /**
+ * Holds the value of {@link #M_ID}.
+ */
+ private double m;
+
+ /**
+ * Parameter to specify the number of nearest neighbors of an object to be
+ * considered for computing its INFLO_SCORE. must be an integer greater than
+ * 1.
+ */
+ public static final OptionID K_ID = new OptionID("inflo.k", "The number of nearest neighbors of an object to be considered for computing its INFLO_SCORE.");
+
+ /**
+ * Holds the value of {@link #K_ID}.
+ */
+ private int k;
+
+ /**
+ * Constructor with parameters.
+ *
+ * @param distanceFunction Distance function in use
+ * @param m m Parameter
+ * @param k k Parameter
+ */
+ public INFLO(DistanceFunction<? super O, D> distanceFunction, double m, int k) {
+ super(distanceFunction);
+ this.m = m;
+ this.k = k;
+ }
+
+ /**
+ * Run the algorithm
+ *
+ * @param database Database to process
+ * @param relation Relation to process
+ * @return Outlier result
+ */
+ public OutlierResult run(Database database, Relation<O> relation) {
+ DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
+
+ ModifiableDBIDs processedIDs = DBIDUtil.newHashSet(relation.size());
+ ModifiableDBIDs pruned = DBIDUtil.newHashSet();
+ // KNNS
+ WritableDataStore<ModifiableDBIDs> knns = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, ModifiableDBIDs.class);
+ // RNNS
+ WritableDataStore<ModifiableDBIDs> rnns = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, ModifiableDBIDs.class);
+ // density
+ WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
+ // init knns and rnns
+ for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ knns.put(iditer, DBIDUtil.newArray());
+ rnns.put(iditer, DBIDUtil.newArray());
+ }
+
+ // TODO: use kNN preprocessor?
+ KNNQuery<O, D> knnQuery = database.getKNNQuery(distFunc, k, DatabaseQuery.HINT_HEAVY_USE);
+
+ for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
+ // if not visited count=0
+ int count = rnns.get(id).size();
+ if (!processedIDs.contains(id)) {
+ // TODO: use exactly k neighbors?
+ KNNList<D> list = knnQuery.getKNNForDBID(id, k);
+ knns.get(id).addDBIDs(list);
+ processedIDs.add(id);
+ density.putDouble(id, 1 / list.getKNNDistance().doubleValue());
+
+ }
+ ModifiableDBIDs s = knns.get(id);
+ for (DBIDIter q = knns.get(id).iter(); q.valid(); q.advance()) {
+ if (!processedIDs.contains(q)) {
+ // TODO: use exactly k neighbors?
+ KNNList<D> listQ = knnQuery.getKNNForDBID(q, k);
+ knns.get(q).addDBIDs(listQ);
+ density.putDouble(q, 1 / listQ.getKNNDistance().doubleValue());
+ processedIDs.add(q);
+ }
+
+ if (knns.get(q).contains(id)) {
+ rnns.get(q).add(id);
+ rnns.get(id).add(q);
+ count++;
+ }
+ }
+ if (count >= s.size() * m) {
+ pruned.add(id);
+ }
+ }
+
+ // Calculate INFLO for any Object
+ // IF Object is pruned INFLO=1.0
+ DoubleMinMax inflominmax = new DoubleMinMax();
+ WritableDoubleDataStore inflos = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
+ for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
+ if (!pruned.contains(id)) {
+ ModifiableDBIDs knn = knns.get(id);
+ ModifiableDBIDs rnn = rnns.get(id);
+
+ double denP = density.doubleValue(id);
+ knn.addDBIDs(rnn);
+ Mean mean = new Mean();
+ for (DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) {
+ mean.put(density.doubleValue(iter));
+ }
+ double den = mean.getMean() / denP;
+ inflos.putDouble(id, den);
+ // update minimum and maximum
+ inflominmax.put(den);
+
+ }
+ if (pruned.contains(id)) {
+ inflos.putDouble(id, 1.0);
+ inflominmax.put(1.0);
+ }
+ }
+
+ // Build result representation.
+ Relation<Double> scoreResult = new MaterializedRelation<>("Influence Outlier Score", "inflo-outlier", TypeUtil.DOUBLE, inflos, relation.getDBIDs());
+ OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(inflominmax.getMin(), inflominmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
+ return new OutlierResult(scoreMeta, scoreResult);
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ protected double m = 1.0;
+
+ protected int k = 0;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final DoubleParameter mP = new DoubleParameter(M_ID, 1.0);
+ mP.addConstraint(new GreaterConstraint(0.0));
+ if (config.grab(mP)) {
+ m = mP.doubleValue();
+ }
+
+ final IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(new GreaterConstraint(1));
+ if (config.grab(kP)) {
+ k = kP.intValue();
+ }
+ }
+
+ @Override
+ protected INFLO<O, D> makeInstance() {
+ return new INFLO<>(distanceFunction, m, k);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java new file mode 100644 index 00000000..4a86e93d --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java @@ -0,0 +1,356 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceKNNList; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; +import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.GaussianKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Outlier Detection with Kernel Density Functions. + * + * A variation of LOF which uses kernel density estimation, but in contrast to + * {@link SimpleKernelDensityLOF} also uses the reachability concept of LOF. + * + * Reference: + * <p> + * Outlier Detection with Kernel Density Functions.<br/> + * L. J. Latecki, A. Lazarevic, D. Pokrajac<br /> + * Machine Learning and Data Mining in Pattern Recognition 2007 + * </p> + * + * @author Erich Schubert + * + * @apiviz.has KNNQuery + * @apiviz.has KernelDensityFunction + * + * @param <O> the type of objects handled by this Algorithm + * @param <D> Distance type + */ +@Reference(authors = "L. J. Latecki, A. Lazarevic, D. Pokrajac", title = "Outlier Detection with Kernel Density Functions", booktitle = "Machine Learning and Data Mining in Pattern Recognition", url = "http://dx.doi.org/10.1007/978-3-540-73499-4_6") +public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(LDF.class); + + /** + * Parameter k. + */ + protected int k; + + /** + * Bandwidth scaling factor. + */ + protected double h = 1; + + /** + * Scaling constant, to limit value range to 1/c + */ + protected double c = 0.1; + + /** + * Kernel density function + */ + private KernelDensityFunction kernel; + + /** + * Constructor. + * + * @param k the value of k + * @param kernel Kernel function + * @param h Kernel bandwidth scaling + * @param c Score scaling parameter + */ + public LDF(int k, DistanceFunction<? super O, D> distance, KernelDensityFunction kernel, double h, double c) { + super(distance); + this.k = k + 1; + this.kernel = kernel; + this.h = h; + this.c = c; + } + + /** + * Run the naive kernel density LOF algorithm. + * + * @param database Database to query + * @param relation Data to process + * @return LOF outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + StepProgress stepprog = LOG.isVerbose() ? new StepProgress("LDF", 3) : null; + + final int dim = RelationUtil.dimensionality(relation); + + DBIDs ids = relation.getDBIDs(); + + // "HEAVY" flag for KNN Query since it is used more than once + KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); + // No optimized kNN query - use a preprocessor! + if (!(knnq instanceof PreprocessorKNNQuery)) { + if (stepprog != null) { + stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG); + } + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); + database.addIndex(preproc); + DistanceQuery<O, D> rdq = database.getDistanceQuery(relation, getDistanceFunction()); + knnq = preproc.getKNNQuery(rdq, k); + } + + // Compute LDEs + if (stepprog != null) { + stepprog.beginStep(2, "Computing LDEs.", LOG); + } + WritableDoubleDataStore ldes = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + FiniteProgress densProgress = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null; + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); + double sum = 0.0; + int count = 0; + if (neighbors instanceof DoubleDistanceKNNList) { + // Fast version for double distances + for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + final double nkdist = ((DoubleDistanceKNNList) knnq.getKNNForDBID(neighbor, k)).doubleKNNDistance(); + if (nkdist > 0.) { + final double v = Math.max(nkdist, neighbor.doubleDistance()) / (h * nkdist); + sum += kernel.density(v) / Math.pow(h * nkdist, dim); + count++; + } else { + sum = Double.POSITIVE_INFINITY; + count++; + break; + } + } + } else { + for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + final double nkdist = knnq.getKNNForDBID(neighbor, k).getKNNDistance().doubleValue(); + if (nkdist > 0.) { + final double v = Math.max(nkdist, neighbor.getDistance().doubleValue()) / (h * nkdist); + sum += kernel.density(v) / Math.pow(h * nkdist, dim); + count++; + } else { + sum = Double.POSITIVE_INFINITY; + count++; + break; + } + } + } + ldes.putDouble(it, sum / count); + if (densProgress != null) { + densProgress.incrementProcessed(LOG); + } + } + if (densProgress != null) { + densProgress.ensureCompleted(LOG); + } + + // Compute local density factors. + if (stepprog != null) { + stepprog.beginStep(3, "Computing LDFs.", LOG); + } + WritableDoubleDataStore ldfs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + // track the maximum value for normalization. + DoubleMinMax lofminmax = new DoubleMinMax(); + + FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("Local Density Factors", ids.size(), LOG) : null; + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + final double lrdp = ldes.doubleValue(it); + final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); + double sum = 0.0; + int count = 0; + for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + // skip the point itself + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + sum += ldes.doubleValue(neighbor); + count++; + } + sum /= count; + final double div = lrdp + c * sum; + double ldf = (div > 0) ? sum / div : 0; + ldfs.putDouble(it, ldf); + // update minimum and maximum + lofminmax.put(ldf); + + if (progressLOFs != null) { + progressLOFs.incrementProcessed(LOG); + } + } + if (progressLOFs != null) { + progressLOFs.ensureCompleted(LOG); + } + + if (stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Local Density Factor", "ldf-outlier", TypeUtil.DOUBLE, ldfs, ids); + OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, 1. / c, 1 / (1 + c)); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(new CombinedTypeInformation(getDistanceFunction().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD)); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> vector type + * @param <D> distance type + */ + public static class Parameterizer<O extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Option ID for kernel. + */ + public static final OptionID KERNEL_ID = new OptionID("ldf.kernel", "Kernel to use for LDF."); + + /** + * Option ID for k + */ + public static final OptionID K_ID = new OptionID("ldf.k", "Number of neighbors to use for LDF."); + + /** + * Option ID for h - kernel bandwidth scaling + */ + public static final OptionID H_ID = new OptionID("ldf.h", "Kernel bandwidth multiplier for LDF."); + + /** + * Option ID for c + */ + public static final OptionID C_ID = new OptionID("ldf.c", "Score scaling parameter for LDF."); + + /** + * The neighborhood size to use. + */ + protected int k = 2; + + /** + * Kernel density function parameter + */ + KernelDensityFunction kernel; + + /** + * Bandwidth scaling factor. + */ + protected double h = 1; + + /** + * Scaling constant, to limit value range to 1/c + */ + protected double c = 0.1; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + final IntParameter pK = new IntParameter(K_ID); + pK.addConstraint(new GreaterConstraint(1)); + if (config.grab(pK)) { + k = pK.getValue(); + } + + ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, GaussianKernelDensityFunction.class); + if (config.grab(kernelP)) { + kernel = kernelP.instantiateClass(config); + } + + DoubleParameter hP = new DoubleParameter(H_ID); + if (config.grab(hP)) { + h = hP.doubleValue(); + } + + DoubleParameter cP = new DoubleParameter(C_ID, 0.1); + if (config.grab(cP)) { + c = cP.doubleValue(); + } + } + + @Override + protected LDF<O, D> makeInstance() { + return new LDF<>(k, distanceFunction, kernel, h, c); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java new file mode 100644 index 00000000..80ed3f68 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java @@ -0,0 +1,213 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2011
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.Alias;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * <p>
+ * Computes the LDOF (Local Distance-Based Outlier Factor) for all objects of a
+ * Database.
+ * </p>
+ *
+ * <p>
+ * Reference:<br>
+ * K. Zhang, M. Hutter, H. Jin: A New Local Distance-Based Outlier Detection
+ * Approach for Scattered Real-World Data.<br>
+ * In: Proc. 13th Pacific-Asia Conference on Advances in Knowledge Discovery and
+ * Data Mining (PAKDD 2009), Bangkok, Thailand, 2009.
+ * </p>
+ *
+ * @author Arthur Zimek
+ *
+ * @apiviz.has KNNQuery
+ *
+ * @param <O> the type of DatabaseObjects handled by this Algorithm
+ */
+@Title("LDOF: Local Distance-Based Outlier Factor")
+@Description("Local outlier detection appraoch suitable for scattered data by averaging the kNN distance over all k nearest neighbors")
+@Reference(authors = "K. Zhang, M. Hutter, H. Jin", title = "A New Local Distance-Based Outlier Detection Approach for Scattered Real-World Data", booktitle = "Proc. 13th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2009), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2_84")
+@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LDOF"})
+public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(LDOF.class);
+
+ /**
+ * Parameter to specify the number of nearest neighbors of an object to be
+ * considered for computing its LDOF_SCORE, must be an integer greater than 1.
+ */
+ public static final OptionID K_ID = new OptionID("ldof.k", "The number of nearest neighbors of an object to be considered for computing its LDOF_SCORE.");
+
+ /**
+ * The baseline for LDOF values. The paper gives 0.5 for uniform
+ * distributions, although one might also discuss using 1.0 as baseline.
+ */
+ private static final double LDOF_BASELINE = 0.5;
+
+ /**
+ * Holds the value of {@link #K_ID}.
+ */
+ int k;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k Parameter
+ */
+ public LDOF(DistanceFunction<? super O, D> distanceFunction, int k) {
+ super(distanceFunction);
+ this.k = k;
+ }
+
+ /**
+ * Run the algorithm
+ *
+ * @param database Database to process
+ * @param relation Relation to process
+ * @return Outlier result
+ */
+ public OutlierResult run(Database database, Relation<O> relation) {
+ DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
+ KNNQuery<O, D> knnQuery = database.getKNNQuery(distFunc, k);
+
+ // track the maximum value for normalization
+ DoubleMinMax ldofminmax = new DoubleMinMax();
+ // compute the ldof values
+ WritableDoubleDataStore ldofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+
+ // compute LOF_SCORE of each db object
+ if(LOG.isVerbose()) {
+ LOG.verbose("Computing LDOFs");
+ }
+ FiniteProgress progressLDOFs = LOG.isVerbose() ? new FiniteProgress("LDOF_SCORE for objects", relation.size(), LOG) : null;
+
+ Mean dxp = new Mean(), Dxp = new Mean();
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + KNNList<D> neighbors = knnQuery.getKNNForDBID(iditer, k);
+ // skip the point itself
+ dxp.reset(); Dxp.reset();
+ // TODO: optimize for double distances
+ for (DistanceDBIDListIter<D> neighbor1 = neighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
+ if(!DBIDUtil.equal(neighbor1, iditer)) {
+ dxp.put(neighbor1.getDistance().doubleValue());
+ for (DistanceDBIDListIter<D> neighbor2 = neighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
+ if(!DBIDUtil.equal(neighbor1, neighbor2) && !DBIDUtil.equal(neighbor2, iditer)) {
+ Dxp.put(distFunc.distance(neighbor1, neighbor2).doubleValue());
+ }
+ }
+ }
+ }
+ double ldof = dxp.getMean() / Dxp.getMean();
+ if(Double.isNaN(ldof) || Double.isInfinite(ldof)) {
+ ldof = 1.0;
+ }
+ ldofs.putDouble(iditer, ldof);
+ // update maximum
+ ldofminmax.put(ldof);
+
+ if(progressLDOFs != null) {
+ progressLDOFs.incrementProcessed(LOG);
+ }
+ }
+ if(progressLDOFs != null) {
+ progressLDOFs.ensureCompleted(LOG);
+ }
+
+ // Build result representation.
+ Relation<Double> scoreResult = new MaterializedRelation<>("LDOF Outlier Score", "ldof-outlier", TypeUtil.DOUBLE, ldofs, relation.getDBIDs());
+ OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(ldofminmax.getMin(), ldofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, LDOF_BASELINE);
+ return new OutlierResult(scoreMeta, scoreResult);
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ protected int k = 0;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(new GreaterConstraint(1));
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+ }
+
+ @Override
+ protected LDOF<O, D> makeInstance() {
+ return new LDOF<>(distanceFunction, k);
+ }
+ }
+}
\ No newline at end of file diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java new file mode 100644 index 00000000..e76c6034 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOCI.java @@ -0,0 +1,340 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; + +/** + * Fast Outlier Detection Using the "Local Correlation Integral". + * + * Exact implementation only, not aLOCI. See {@link ALOCI} + * + * Outlier detection using multiple epsilon neighborhoods. + * + * Based on: S. Papadimitriou, H. Kitagawa, P. B. Gibbons and C. Faloutsos: + * LOCI: Fast Outlier Detection Using the Local Correlation Integral. In: Proc. + * 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003. + * + * @author Erich Schubert + * + * @apiviz.has RangeQuery + * + * @param <O> Object type + * @param <D> Distance type + */ +@Title("LOCI: Fast Outlier Detection Using the Local Correlation Integral") +@Description("Algorithm to compute outliers based on the Local Correlation Integral") +@Reference(authors = "S. Papadimitriou, H. Kitagawa, P. B. Gibbons, C. Faloutsos", title = "LOCI: Fast Outlier Detection Using the Local Correlation Integral", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDE.2003.1260802") +@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LOCI"}) +public class LOCI<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(LOCI.class); + + /** + * Parameter to specify the maximum radius of the neighborhood to be + * considered, must be suitable to the distance function specified. + */ + public static final OptionID RMAX_ID = new OptionID("loci.rmax", "The maximum radius of the neighborhood to be considered."); + + /** + * Parameter to specify the minimum neighborhood size + */ + public static final OptionID NMIN_ID = new OptionID("loci.nmin", "Minimum neighborhood size to be considered."); + + /** + * Parameter to specify the averaging neighborhood scaling. + */ + public static final OptionID ALPHA_ID = new OptionID("loci.alpha", "Scaling factor for averaging neighborhood"); + + /** + * Holds the value of {@link #RMAX_ID}. + */ + private D rmax; + + /** + * Holds the value of {@link #NMIN_ID}. + */ + private int nmin; + + /** + * Holds the value of {@link #ALPHA_ID}. + */ + private double alpha; + + /** + * Constructor. + * + * @param distanceFunction Distance function + * @param rmax Maximum radius + * @param nmin Minimum neighborhood size + * @param alpha Alpha value + */ + public LOCI(DistanceFunction<? super O, D> distanceFunction, D rmax, int nmin, double alpha) { + super(distanceFunction); + this.rmax = rmax; + this.nmin = nmin; + this.alpha = alpha; + } + + /** + * Run the algorithm + * + * @param database Database to process + * @param relation Relation to process + * @return Outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); + RangeQuery<O, D> rangeQuery = database.getRangeQuery(distFunc); + + FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("LOCI preprocessing", relation.size(), LOG) : null; + // LOCI preprocessing step + WritableDataStore<ArrayList<DoubleIntPair>> interestingDistances = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_SORTED, ArrayList.class); + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(iditer, rmax); + // build list of critical distances + ArrayList<DoubleIntPair> cdist = new ArrayList<>(neighbors.size() << 1); + { + for(int i = 0; i < neighbors.size(); i++) { + DistanceDBIDPair<D> r = neighbors.get(i); + if(i + 1 < neighbors.size() && r.getDistance().compareTo(neighbors.get(i + 1).getDistance()) == 0) { + continue; + } + cdist.add(new DoubleIntPair(r.getDistance().doubleValue(), i)); + final double ri = r.getDistance().doubleValue() / alpha; + if(ri <= rmax.doubleValue()) { + cdist.add(new DoubleIntPair(ri, Integer.MIN_VALUE)); + } + } + } + Collections.sort(cdist); + // fill the gaps to have fast lookups of number of neighbors at a given + // distance. + int lastk = 0; + for(DoubleIntPair c : cdist) { + if(c.second == Integer.MIN_VALUE) { + c.second = lastk; + } + else { + lastk = c.second; + } + } + + interestingDistances.put(iditer, cdist); + if(progressPreproc != null) { + progressPreproc.incrementProcessed(LOG); + } + } + if(progressPreproc != null) { + progressPreproc.ensureCompleted(LOG); + } + // LOCI main step + FiniteProgress progressLOCI = LOG.isVerbose() ? new FiniteProgress("LOCI scores", relation.size(), LOG) : null; + WritableDoubleDataStore mdef_norm = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + WritableDoubleDataStore mdef_radius = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + DoubleMinMax minmax = new DoubleMinMax(); + + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final List<DoubleIntPair> cdist = interestingDistances.get(iditer); + final double maxdist = cdist.get(cdist.size() - 1).first; + final int maxneig = cdist.get(cdist.size() - 1).second; + + double maxmdefnorm = 0.0; + double maxnormr = 0; + if(maxneig >= nmin) { + D range = distFunc.getDistanceFactory().fromDouble(maxdist); + // Compute the largest neighborhood we will need. + DistanceDBIDList<D> maxneighbors = rangeQuery.getRangeForDBID(iditer, range); + // TODO: Ensure the set is sorted. Should be a no-op with most indexes. + // For any critical distance, compute the normalized MDEF score. + for(DoubleIntPair c : cdist) { + // Only start when minimum size is fulfilled + if (c.second < nmin) { + continue; + } + final double r = c.first; + final double alpha_r = alpha * r; + // compute n(p_i, \alpha * r) from list (note: alpha_r is different from c!) + final int n_alphar = elementsAtRadius(cdist, alpha_r); + // compute \hat{n}(p_i, r, \alpha) and the corresponding \simga_{MDEF} + MeanVariance mv_n_r_alpha = new MeanVariance(); + // TODO: optimize for double distances + for (DistanceDBIDListIter<D> neighbor = maxneighbors.iter(); neighbor.valid(); neighbor.advance()) { + // Stop at radius r + if(neighbor.getDistance().doubleValue() > r) { + break; + } + int rn_alphar = elementsAtRadius(interestingDistances.get(neighbor), alpha_r); + mv_n_r_alpha.put(rn_alphar); + } + // We only use the average and standard deviation + final double nhat_r_alpha = mv_n_r_alpha.getMean(); + final double sigma_nhat_r_alpha = mv_n_r_alpha.getNaiveStddev(); + + // Redundant divisions removed. + final double mdef = (nhat_r_alpha - n_alphar); // / nhat_r_alpha; + final double sigmamdef = sigma_nhat_r_alpha; // / nhat_r_alpha; + final double mdefnorm = mdef / sigmamdef; + + if(mdefnorm > maxmdefnorm) { + maxmdefnorm = mdefnorm; + maxnormr = r; + } + } + } + else { + // FIXME: when nmin was not fulfilled - what is the proper value then? + maxmdefnorm = 1.0; + maxnormr = maxdist; + } + mdef_norm.putDouble(iditer, maxmdefnorm); + mdef_radius.putDouble(iditer, maxnormr); + minmax.put(maxmdefnorm); + if(progressLOCI != null) { + progressLOCI.incrementProcessed(LOG); + } + } + if(progressLOCI != null) { + progressLOCI.ensureCompleted(LOG); + } + Relation<Double> scoreResult = new MaterializedRelation<>("LOCI normalized MDEF", "loci-mdef-outlier", TypeUtil.DOUBLE, mdef_norm, relation.getDBIDs()); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0.0); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + result.addChildResult(new MaterializedRelation<>("LOCI MDEF Radius", "loci-critical-radius", TypeUtil.DOUBLE, mdef_radius, relation.getDBIDs())); + return result; + } + + /** + * Get the number of objects for a given radius, from the list of critical + * distances, storing (radius, count) pairs. + * + * @param criticalDistances + * @param radius + * @return Number of elements at the given radius + */ + protected int elementsAtRadius(List<DoubleIntPair> criticalDistances, final double radius) { + int n_r = 0; + for(DoubleIntPair c2 : criticalDistances) { + if(c2.first > radius) { + break; + } + if(c2.second != Integer.MIN_VALUE) { + // Update + n_r = c2.second; + } + } + return n_r; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + protected D rmax = null; + + protected int nmin = 0; + + protected double alpha = 0.5; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final D distanceFactory = (distanceFunction != null) ? distanceFunction.getDistanceFactory() : null; + final DistanceParameter<D> rmaxP = new DistanceParameter<>(RMAX_ID, distanceFactory); + if(config.grab(rmaxP)) { + rmax = rmaxP.getValue(); + } + + final IntParameter nminP = new IntParameter(NMIN_ID, 20); + if(config.grab(nminP)) { + nmin = nminP.getValue(); + } + + final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5); + if(config.grab(alphaP)) { + alpha = alphaP.getValue(); + } + } + + @Override + protected LOCI<O, D> makeInstance() { + return new LOCI<>(distanceFunction, rmax, nmin, alpha); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java new file mode 100644 index 00000000..302dafe6 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java @@ -0,0 +1,293 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceKNNList; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; +import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * <p> + * Algorithm to compute density-based local outlier factors in a database based + * on a specified parameter {@link Parameterizer#K_ID} ({@code -lof.k}). + * </p> + * + * <p> + * The original LOF parameter was called "minPts", but for consistency + * within ELKI we have renamed this parameter to "k". + * </p> + * + * <p> + * Reference: <br> + * M. M. Breunig, H.-P. Kriegel, R. Ng, J. Sander: LOF: Identifying + * Density-Based Local Outliers. <br> + * In: Proc. 2nd ACM SIGMOD Int. Conf. on Management of Data (SIGMOD'00), + * Dallas, TX, 2000. + * </p> + * + * @author Erich Schubert + * @author Elke Achtert + * + * @apiviz.has KNNQuery + * + * @param <O> the type of DatabaseObjects handled by this Algorithm + * @param <D> Distance type + */ +@Title("LOF: Local Outlier Factor") +@Description("Algorithm to compute density-based local outlier factors in a database based on the neighborhood size parameter 'k'") +@Reference(authors = "M. M. Breunig, H.-P. Kriegel, R. Ng, and J. Sander", title = "LOF: Identifying Density-Based Local Outliers", booktitle = "Proc. 2nd ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '00), Dallas, TX, 2000", url = "http://dx.doi.org/10.1145/342009.335388") +@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LOF", "outlier.LOF", "LOF" }) +public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(LOF.class); + + /** + * Holds the value of {@link Parameterizer#K_ID}. + */ + protected int k = 2; + + /** + * Constructor. + * + * @param k the value of k + * @param distanceFunction the neighborhood distance function + */ + public LOF(int k, DistanceFunction<? super O, D> distanceFunction) { + super(distanceFunction); + this.k = k + 1; + } + + /** + * Performs the Generalized LOF_SCORE algorithm on the given database. + * + * @param database Database to query + * @param relation Data to process + * @return LOF outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + StepProgress stepprog = LOG.isVerbose() ? new StepProgress("LOF", 3) : null; + DistanceQuery<O, D> dq = database.getDistanceQuery(relation, getDistanceFunction()); + // "HEAVY" flag for knn query since it is used more than once + KNNQuery<O, D> knnq = database.getKNNQuery(dq, k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); + // No optimized kNN query - use a preprocessor! + if (!(knnq instanceof PreprocessorKNNQuery)) { + if (stepprog != null) { + stepprog.beginStep(1, "Materializing LOF neighborhoods.", LOG); + } + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); + knnq = preproc.getKNNQuery(dq, k); + } + DBIDs ids = relation.getDBIDs(); + + // Compute LRDs + if (stepprog != null) { + stepprog.beginStep(2, "Computing LRDs.", LOG); + } + WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + { + FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("LRD", ids.size(), LOG) : null; + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); + double sum = 0.0; + int count = 0; + if (neighbors instanceof DoubleDistanceKNNList) { + // Fast version for double distances + for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, iter)) { + continue; + } + KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k); + final double nkdist; + if (neighborsNeighbors instanceof DoubleDistanceKNNList) { + nkdist = ((DoubleDistanceKNNList) neighborsNeighbors).doubleKNNDistance(); + } else { + nkdist = neighborsNeighbors.getKNNDistance().doubleValue(); + } + sum += Math.max(neighbor.doubleDistance(), nkdist); + count++; + } + } else { + for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, iter)) { + continue; + } + KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k); + sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue()); + count++; + } + } + // Avoid division by 0 + final double lrd = (sum > 0) ? (count / sum) : Double.POSITIVE_INFINITY; + lrds.putDouble(iter, lrd); + if (lrdsProgress != null) { + lrdsProgress.incrementProcessed(LOG); + } + } + if (lrdsProgress != null) { + lrdsProgress.ensureCompleted(LOG); + } + } + + // compute LOF_SCORE of each db object + if (stepprog != null) { + stepprog.beginStep(3, "Computing LOFs.", LOG); + } + WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + // track the maximum value for normalization. + DoubleMinMax lofminmax = new DoubleMinMax(); + { + FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), LOG) : null; + for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { + final double lof; + final double lrdp = lrds.doubleValue(iter); + final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k); + if (!Double.isInfinite(lrdp)) { + double sum = 0.0; + int count = 0; + for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + // skip the point itself + if (DBIDUtil.equal(neighbor, iter)) { + continue; + } + final double val = lrds.doubleValue(neighbor); + sum += val; + count++; + if (Double.isInfinite(val)) { + break; + } + } + lof = sum / (lrdp * count); + } else { + lof = 1.0; + } + lofs.putDouble(iter, lof); + // update minimum and maximum + lofminmax.put(lof); + + if (progressLOFs != null) { + progressLOFs.incrementProcessed(LOG); + } + } + if (progressLOFs != null) { + progressLOFs.ensureCompleted(LOG); + } + } + + if (stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Parameter to specify the number of nearest neighbors of an object to be + * considered for computing its LOF_SCORE, must be an integer greater than + * 1. + */ + public static final OptionID K_ID = new OptionID("lof.k", "The number of nearest neighbors of an object to be considered for computing its LOF_SCORE."); + + /** + * The neighborhood size to use. + */ + protected int k = 2; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + final IntParameter pK = new IntParameter(K_ID); + pK.addConstraint(new GreaterConstraint(1)); + if (config.grab(pK)) { + k = pK.getValue(); + } + } + + @Override + protected LOF<O, D> makeInstance() { + return new LOF<>(k, distanceFunction); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java new file mode 100644 index 00000000..15ff690a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java @@ -0,0 +1,442 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceKNNList; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; +import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.math.MeanVariance; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; + +/** + * LoOP: Local Outlier Probabilities + * + * Distance/density based algorithm similar to LOF to detect outliers, but with + * statistical methods to achieve better result stability. + * + * @author Erich Schubert + * + * @apiviz.has KNNQuery + * + * @param <O> type of objects handled by this algorithm + * @param <D> type of distances used + */ +@Title("LoOP: Local Outlier Probabilities") +@Description("Variant of the LOF algorithm normalized using statistical values.") +@Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "LoOP: Local Outlier Probabilities", booktitle = "Proceedings of the 18th International Conference on Information and Knowledge Management (CIKM), Hong Kong, China, 2009", url = "http://dx.doi.org/10.1145/1645953.1646195") +@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LoOP", "LoOP", "outlier.LoOP" }) +public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(LoOP.class); + + /** + * The distance function to determine the reachability distance between + * database objects. + */ + public static final OptionID REACHABILITY_DISTANCE_FUNCTION_ID = new OptionID("loop.referencedistfunction", "Distance function to determine the density of an object."); + + /** + * The distance function to determine the reachability distance between + * database objects. + */ + public static final OptionID COMPARISON_DISTANCE_FUNCTION_ID = new OptionID("loop.comparedistfunction", "Distance function to determine the reference set of an object."); + + /** + * Parameter to specify the number of nearest neighbors of an object to be + * considered for computing its LOOP_SCORE, must be an integer greater than 1. + */ + public static final OptionID KREACH_ID = new OptionID("loop.kref", "The number of nearest neighbors of an object to be used for the PRD value."); + + /** + * Parameter to specify the number of nearest neighbors of an object to be + * considered for computing its LOOP_SCORE, must be an integer greater than 1. + */ + public static final OptionID KCOMP_ID = new OptionID("loop.kcomp", "The number of nearest neighbors of an object to be considered for computing its LOOP_SCORE."); + + /** + * Parameter to specify the number of nearest neighbors of an object to be + * considered for computing its LOOP_SCORE, must be an integer greater than 1. + */ + public static final OptionID LAMBDA_ID = new OptionID("loop.lambda", "The number of standard deviations to consider for density computation."); + + /** + * Holds the value of {@link #KREACH_ID}. + */ + int kreach; + + /** + * Holds the value of {@link #KCOMP_ID}. + */ + int kcomp; + + /** + * Hold the value of {@link #LAMBDA_ID}. + */ + double lambda; + + /** + * Preprocessor Step 1. + */ + protected DistanceFunction<? super O, D> reachabilityDistanceFunction; + + /** + * Preprocessor Step 2. + */ + protected DistanceFunction<? super O, D> comparisonDistanceFunction; + + /** + * Include object itself in kNN neighborhood. + */ + static boolean objectIsInKNN = false; + + /** + * Constructor with parameters. + * + * @param kreach k for reachability + * @param kcomp k for comparison + * @param reachabilityDistanceFunction distance function for reachability + * @param comparisonDistanceFunction distance function for comparison + * @param lambda Lambda parameter + */ + public LoOP(int kreach, int kcomp, DistanceFunction<? super O, D> reachabilityDistanceFunction, DistanceFunction<? super O, D> comparisonDistanceFunction, double lambda) { + super(); + this.kreach = kreach; + this.kcomp = kcomp; + this.reachabilityDistanceFunction = reachabilityDistanceFunction; + this.comparisonDistanceFunction = comparisonDistanceFunction; + this.lambda = lambda; + } + + /** + * Get the kNN queries for the algorithm. + * + * @param database Database to analyze + * @param relation Relation to analyze + * @param stepprog Progress logger, may be {@code null} + * @return result + */ + protected Pair<KNNQuery<O, D>, KNNQuery<O, D>> getKNNQueries(Database database, Relation<O> relation, StepProgress stepprog) { + KNNQuery<O, D> knnComp; + KNNQuery<O, D> knnReach; + if (comparisonDistanceFunction == reachabilityDistanceFunction || comparisonDistanceFunction.equals(reachabilityDistanceFunction)) { + // We need each neighborhood twice - use "HEAVY" flag. + knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, Math.max(kreach, kcomp), DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); + // No optimized kNN query - use a preprocessor! + if (knnComp == null) { + if (stepprog != null) { + stepprog.beginStep(1, "Materializing neighborhoods with respect to reference neighborhood distance function.", LOG); + } + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, comparisonDistanceFunction, kcomp); + database.addIndex(preproc); + DistanceQuery<O, D> cdq = database.getDistanceQuery(relation, comparisonDistanceFunction); + knnComp = preproc.getKNNQuery(cdq, kreach, DatabaseQuery.HINT_HEAVY_USE); + } else { + if (stepprog != null) { + stepprog.beginStep(1, "Optimized neighborhoods provided by database.", LOG); + } + } + knnReach = knnComp; + } else { + if (stepprog != null) { + stepprog.beginStep(1, "Not materializing distance functions, since we request each DBID once only.", LOG); + } + knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, kreach); + knnReach = QueryUtil.getKNNQuery(relation, reachabilityDistanceFunction, kcomp); + } + return new Pair<>(knnComp, knnReach); + } + + /** + * Performs the LoOP algorithm on the given database. + * + * @param database Database to process + * @param relation Relation to process + * @return Outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + final double sqrt2 = Math.sqrt(2.0); + + StepProgress stepprog = LOG.isVerbose() ? new StepProgress(5) : null; + + Pair<KNNQuery<O, D>, KNNQuery<O, D>> pair = getKNNQueries(database, relation, stepprog); + KNNQuery<O, D> knnComp = pair.getFirst(); + KNNQuery<O, D> knnReach = pair.getSecond(); + + // Assert we got something + if (knnComp == null) { + throw new AbortException("No kNN queries supported by database for comparison distance function."); + } + if (knnReach == null) { + throw new AbortException("No kNN queries supported by database for density estimation distance function."); + } + + // Probabilistic distances + WritableDoubleDataStore pdists = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + Mean mean = new Mean(); + {// computing PRDs + if (stepprog != null) { + stepprog.beginStep(3, "Computing pdists", LOG); + } + FiniteProgress prdsProgress = LOG.isVerbose() ? new FiniteProgress("pdists", relation.size(), LOG) : null; + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final KNNList<D> neighbors = knnReach.getKNNForDBID(iditer, kreach); + mean.reset(); + // use first kref neighbors as reference set + int ks = 0; + // TODO: optimize for double distances + if (neighbors instanceof DoubleDistanceKNNList) { + for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { + final double d = neighbor.doubleDistance(); + mean.put(d * d); + ks++; + if (ks >= kreach) { + break; + } + } + } + } else { + for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { + double d = neighbor.getDistance().doubleValue(); + mean.put(d * d); + ks++; + if (ks >= kreach) { + break; + } + } + } + } + double pdist = lambda * Math.sqrt(mean.getMean()); + pdists.putDouble(iditer, pdist); + if (prdsProgress != null) { + prdsProgress.incrementProcessed(LOG); + } + } + } + // Compute PLOF values. + WritableDoubleDataStore plofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + MeanVariance mvplof = new MeanVariance(); + {// compute LOOP_SCORE of each db object + if (stepprog != null) { + stepprog.beginStep(4, "Computing PLOF", LOG); + } + + FiniteProgress progressPLOFs = LOG.isVerbose() ? new FiniteProgress("PLOFs for objects", relation.size(), LOG) : null; + MeanVariance mv = new MeanVariance(); + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + final KNNList<D> neighbors = knnComp.getKNNForDBID(iditer, kcomp); + mv.reset(); + // use first kref neighbors as comparison set. + int ks = 0; + for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) { + mv.put(pdists.doubleValue(neighbor)); + ks++; + if (ks >= kcomp) { + break; + } + } + } + double plof = Math.max(pdists.doubleValue(iditer) / mv.getMean(), 1.0); + if (Double.isNaN(plof) || Double.isInfinite(plof)) { + plof = 1.0; + } + plofs.putDouble(iditer, plof); + mvplof.put((plof - 1.0) * (plof - 1.0)); + + if (progressPLOFs != null) { + progressPLOFs.incrementProcessed(LOG); + } + } + } + + double nplof = lambda * Math.sqrt(mvplof.getMean()); + if (LOG.isDebugging()) { + LOG.verbose("nplof normalization factor is " + nplof + " " + mvplof.getMean() + " " + mvplof.getSampleStddev()); + } + + // Compute final LoOP values. + WritableDoubleDataStore loops = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); + {// compute LOOP_SCORE of each db object + if (stepprog != null) { + stepprog.beginStep(5, "Computing LoOP scores", LOG); + } + + FiniteProgress progressLOOPs = LOG.isVerbose() ? new FiniteProgress("LoOP for objects", relation.size(), LOG) : null; + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + loops.putDouble(iditer, NormalDistribution.erf((plofs.doubleValue(iditer) - 1) / (nplof * sqrt2))); + + if (progressLOOPs != null) { + progressLOOPs.incrementProcessed(LOG); + } + } + } + + if (stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Probabilities", "loop-outlier", TypeUtil.DOUBLE, loops, relation.getDBIDs()); + OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore(); + return new OutlierResult(scoreMeta, scoreResult); + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + final TypeInformation type; + if (reachabilityDistanceFunction.equals(comparisonDistanceFunction)) { + type = reachabilityDistanceFunction.getInputTypeRestriction(); + } else { + type = new CombinedTypeInformation(reachabilityDistanceFunction.getInputTypeRestriction(), comparisonDistanceFunction.getInputTypeRestriction()); + } + return TypeUtil.array(type); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + /** + * Holds the value of {@link #KREACH_ID}. + */ + int kreach = 0; + + /** + * Holds the value of {@link #KCOMP_ID}. + */ + int kcomp = 0; + + /** + * Hold the value of {@link #LAMBDA_ID}. + */ + double lambda = 2.0; + + /** + * Preprocessor Step 1. + */ + protected DistanceFunction<O, D> reachabilityDistanceFunction = null; + + /** + * Preprocessor Step 2. + */ + protected DistanceFunction<O, D> comparisonDistanceFunction = null; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter kcompP = new IntParameter(KCOMP_ID); + kcompP.addConstraint(new GreaterConstraint(1)); + if (config.grab(kcompP)) { + kcomp = kcompP.intValue(); + } + + final ObjectParameter<DistanceFunction<O, D>> compDistP = new ObjectParameter<>(COMPARISON_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class); + if (config.grab(compDistP)) { + comparisonDistanceFunction = compDistP.instantiateClass(config); + } + + final IntParameter kreachP = new IntParameter(KREACH_ID); + kreachP.addConstraint(new GreaterConstraint(1)); + kreachP.setOptional(true); + if (config.grab(kreachP)) { + kreach = kreachP.intValue(); + } else { + kreach = kcomp; + } + + final ObjectParameter<DistanceFunction<O, D>> reachDistP = new ObjectParameter<>(REACHABILITY_DISTANCE_FUNCTION_ID, DistanceFunction.class, true); + if (config.grab(reachDistP)) { + reachabilityDistanceFunction = reachDistP.instantiateClass(config); + } + + // TODO: make default 1.0? + final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, 2.0); + lambdaP.addConstraint(new GreaterConstraint(0.0)); + if (config.grab(lambdaP)) { + lambda = lambdaP.doubleValue(); + } + } + + @Override + protected LoOP<O, D> makeInstance() { + DistanceFunction<O, D> realreach = (reachabilityDistanceFunction != null) ? reachabilityDistanceFunction : comparisonDistanceFunction; + return new LoOP<>(kreach, kcomp, realreach, comparisonDistanceFunction, lambda); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/OnlineLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/OnlineLOF.java new file mode 100644 index 00000000..c01c914f --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/OnlineLOF.java @@ -0,0 +1,418 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures +
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team +
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version. +
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details. +
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.QueryUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
+import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.rknn.RKNNQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.index.preprocessed.knn.AbstractMaterializeKNNPreprocessor;
+import de.lmu.ifi.dbs.elki.index.preprocessed.knn.KNNChangeEvent;
+import de.lmu.ifi.dbs.elki.index.preprocessed.knn.KNNListener;
+import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNAndRKNNPreprocessor;
+import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * Incremental version of the {@link LOF} Algorithm, supports insertions and
+ * removals.
+ *
+ * @author Elke Achtert
+ *
+ * @apiviz.has FlexibleLOF.LOFResult oneway - - updates
+ */
+// TODO: related to publication?
+public class OnlineLOF<O, D extends NumberDistance<D, ?>> extends FlexibleLOF<O, D> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(OnlineLOF.class);
+
+ /**
+ * Constructor.
+ *
+ * @param krefer The number of neighbors for reference
+ * @param kreach The number of neighbors for reachability distance
+ * @param neighborhoodDistanceFunction the neighborhood distance function
+ * @param reachabilityDistanceFunction the reachability distance function
+ */
+ public OnlineLOF(int krefer, int kreach, DistanceFunction<? super O, D> neighborhoodDistanceFunction, DistanceFunction<? super O, D> reachabilityDistanceFunction) {
+ super(krefer, kreach, neighborhoodDistanceFunction, reachabilityDistanceFunction);
+ }
+
+ /**
+ * Performs the Generalized LOF_SCORE algorithm on the given database by
+ * calling {@code #doRunInTime(Database)} and adds a {@link LOFKNNListener} to
+ * the preprocessors.
+ */
+ @Override
+ public OutlierResult run(Database database, Relation<O> relation) {
+ StepProgress stepprog = LOG.isVerbose() ? new StepProgress("OnlineLOF", 3) : null;
+
+ Pair<Pair<KNNQuery<O, D>, KNNQuery<O, D>>, Pair<RKNNQuery<O, D>, RKNNQuery<O, D>>> queries = getKNNAndRkNNQueries(database, relation, stepprog);
+ KNNQuery<O, D> kNNRefer = queries.getFirst().getFirst();
+ KNNQuery<O, D> kNNReach = queries.getFirst().getSecond();
+ RKNNQuery<O, D> rkNNRefer = queries.getSecond().getFirst();
+ RKNNQuery<O, D> rkNNReach = queries.getSecond().getSecond();
+
+ LOFResult<O, D> lofResult = super.doRunInTime(relation.getDBIDs(), kNNRefer, kNNReach, stepprog);
+ lofResult.setRkNNRefer(rkNNRefer);
+ lofResult.setRkNNReach(rkNNReach);
+
+ // add listener
+ KNNListener l = new LOFKNNListener(lofResult);
+ ((MaterializeKNNPreprocessor<O, D>) ((PreprocessorKNNQuery<O, D, ? extends KNNList<D>>) lofResult.getKNNRefer()).getPreprocessor()).addKNNListener(l);
+ ((MaterializeKNNPreprocessor<O, D>) ((PreprocessorKNNQuery<O, D, ? extends KNNList<D>>) lofResult.getKNNReach()).getPreprocessor()).addKNNListener(l);
+
+ return lofResult.getResult();
+ }
+
+ /**
+ * Get the kNN and rkNN queries for the algorithm.
+ *
+ * @param relation Data
+ * @param stepprog Progress logger
+ * @return the kNN and rkNN queries
+ */
+ private Pair<Pair<KNNQuery<O, D>, KNNQuery<O, D>>, Pair<RKNNQuery<O, D>, RKNNQuery<O, D>>> getKNNAndRkNNQueries(Database database, Relation<O> relation, StepProgress stepprog) {
+ // Use "HEAVY" flag, since this is an online algorithm
+ KNNQuery<O, D> kNNRefer = QueryUtil.getKNNQuery(relation, referenceDistanceFunction, krefer, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
+ RKNNQuery<O, D> rkNNRefer = QueryUtil.getRKNNQuery(relation, referenceDistanceFunction, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
+
+ // No optimized kNN query or RkNN query - use a preprocessor!
+ if (kNNRefer == null || rkNNRefer == null) {
+ if (stepprog != null) {
+ stepprog.beginStep(1, "Materializing neighborhood w.r.t. reference neighborhood distance function.", LOG);
+ }
+ MaterializeKNNAndRKNNPreprocessor<O, D> preproc = new MaterializeKNNAndRKNNPreprocessor<>(relation, referenceDistanceFunction, krefer);
+ DistanceQuery<O, D> ndq = database.getDistanceQuery(relation, referenceDistanceFunction);
+ kNNRefer = preproc.getKNNQuery(ndq, krefer, DatabaseQuery.HINT_HEAVY_USE);
+ rkNNRefer = preproc.getRKNNQuery(ndq, krefer, DatabaseQuery.HINT_HEAVY_USE);
+ // add as index
+ relation.getDatabase().addIndex(preproc);
+ } else {
+ if (stepprog != null) {
+ stepprog.beginStep(1, "Optimized neighborhood w.r.t. reference neighborhood distance function provided by database.", LOG);
+ }
+ }
+
+ KNNQuery<O, D> kNNReach = QueryUtil.getKNNQuery(relation, reachabilityDistanceFunction, kreach, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
+ RKNNQuery<O, D> rkNNReach = QueryUtil.getRKNNQuery(relation, reachabilityDistanceFunction, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
+ if (kNNReach == null || rkNNReach == null) {
+ if (stepprog != null) {
+ stepprog.beginStep(2, "Materializing neighborhood w.r.t. reachability distance function.", LOG);
+ }
+ ListParameterization config = new ListParameterization();
+ config.addParameter(AbstractMaterializeKNNPreprocessor.Factory.DISTANCE_FUNCTION_ID, reachabilityDistanceFunction);
+ config.addParameter(AbstractMaterializeKNNPreprocessor.Factory.K_ID, kreach);
+ MaterializeKNNAndRKNNPreprocessor<O, D> preproc = new MaterializeKNNAndRKNNPreprocessor<>(relation, reachabilityDistanceFunction, kreach);
+ DistanceQuery<O, D> rdq = database.getDistanceQuery(relation, reachabilityDistanceFunction);
+ kNNReach = preproc.getKNNQuery(rdq, kreach, DatabaseQuery.HINT_HEAVY_USE);
+ rkNNReach = preproc.getRKNNQuery(rdq, kreach, DatabaseQuery.HINT_HEAVY_USE);
+ // add as index
+ relation.getDatabase().addIndex(preproc);
+ }
+
+ Pair<KNNQuery<O, D>, KNNQuery<O, D>> kNNPair = new Pair<>(kNNRefer, kNNReach);
+ Pair<RKNNQuery<O, D>, RKNNQuery<O, D>> rkNNPair = new Pair<>(rkNNRefer, rkNNReach);
+
+ return new Pair<>(kNNPair, rkNNPair);
+ }
+
+ /**
+ * Encapsulates a listener for changes of kNNs used in the online LOF
+ * algorithm.
+ *
+ * @author Elke Achtert
+ *
+ * @apiviz.exclude
+ */
+ private class LOFKNNListener implements KNNListener {
+ /**
+ * Holds the first event of one of the both preprocessors. The online
+ * algorithm is waiting until both events have been received, i.e. until
+ * both preprocessors are up to date.
+ */
+ private KNNChangeEvent firstEventReceived;
+
+ /**
+ * Holds the result of a former run of the LOF algorithm.
+ */
+ private LOFResult<O, D> lofResult;
+
+ /**
+ * Constructs a listener for the LOF algorithm.
+ *
+ * @param lofResult the result of a former run of the LOF algorithm
+ */
+ public LOFKNNListener(LOFResult<O, D> lofResult) {
+ this.lofResult = lofResult;
+ }
+
+ @Override
+ public void kNNsChanged(KNNChangeEvent e) {
+ AbstractMaterializeKNNPreprocessor<O, D, ?> p1 = ((PreprocessorKNNQuery<O, D, ?>) lofResult.getKNNRefer()).getPreprocessor();
+ AbstractMaterializeKNNPreprocessor<O, D, ?> p2 = ((PreprocessorKNNQuery<O, D, ?>) lofResult.getKNNReach()).getPreprocessor();
+
+ if (firstEventReceived == null) {
+ if (e.getSource().equals(p1) && e.getSource().equals(p2)) {
+ kNNsChanged(e, e);
+ } else {
+ firstEventReceived = e;
+ }
+ } else {
+ if (e.getSource().equals(p1) && firstEventReceived.getSource().equals(p2)) {
+ kNNsChanged(e, firstEventReceived);
+ firstEventReceived = null;
+ } else if (e.getSource().equals(p2) && firstEventReceived.getSource().equals(p1)) {
+ kNNsChanged(firstEventReceived, e);
+ firstEventReceived = null;
+ } else {
+ throw new UnsupportedOperationException("Event sources do not fit!");
+ }
+ }
+ }
+
+ /**
+ * Invoked after the events of both preprocessors have been received, i.e.
+ * until both preprocessors are up to date.
+ *
+ * @param e1 the change event of the first preprocessor
+ * @param e2 the change event of the second preprocessor
+ */
+ private void kNNsChanged(KNNChangeEvent e1, KNNChangeEvent e2) {
+ if (!e1.getType().equals(e2.getType())) {
+ throw new UnsupportedOperationException("Event types do not fit: " + e1.getType() + " != " + e2.getType());
+ }
+ if (!e1.getObjects().equals(e2.getObjects())) {
+ throw new UnsupportedOperationException("Objects do not fit: " + e1.getObjects() + " != " + e2.getObjects());
+ }
+
+ if (e1.getType().equals(KNNChangeEvent.Type.DELETE)) {
+ kNNsRemoved(e1.getObjects(), e1.getUpdates(), e2.getUpdates(), lofResult);
+ } else if (e1.getType().equals(KNNChangeEvent.Type.INSERT)) {
+ kNNsInserted(e1.getObjects(), e1.getUpdates(), e2.getUpdates(), lofResult);
+ } else {
+ throw new UnsupportedOperationException("Unsupported event type: " + e1.getType());
+ }
+ }
+
+ /**
+ * Invoked after kNNs have been inserted and updated, updates the result.
+ *
+ * @param insertions the ids of the newly inserted neighborhoods
+ * @param updates1 the ids of the updated neighborhood w.r.t. the
+ * neighborhood distance function
+ * @param updates2 the ids of the updated neighborhood w.r.t. the
+ * reachability distance function
+ * @param lofResult the result of the former LOF run
+ */
+ private void kNNsInserted(DBIDs insertions, DBIDs updates1, DBIDs updates2, LOFResult<O, D> lofResult) {
+ StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
+
+ // recompute lrds
+ if (stepprog != null) {
+ stepprog.beginStep(1, "Recompute LRDs.", LOG);
+ }
+ ArrayDBIDs lrd_ids = DBIDUtil.ensureArray(DBIDUtil.union(insertions, updates2));
+ List<? extends DistanceDBIDList<D>> reachDistRKNNs = lofResult.getRkNNReach().getRKNNForBulkDBIDs(lrd_ids, kreach);
+ ArrayDBIDs affected_lrd_id_candidates = mergeIDs(reachDistRKNNs, lrd_ids);
+ ArrayModifiableDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size());
+ WritableDoubleDataStore new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach());
+ for (DBIDIter iter = affected_lrd_id_candidates.iter(); iter.valid(); iter.advance()) {
+ double new_lrd = new_lrds.doubleValue(iter);
+ double old_lrd = lofResult.getLrds().doubleValue(iter);
+ if (Double.isNaN(old_lrd) || old_lrd != new_lrd) {
+ lofResult.getLrds().putDouble(iter, new_lrd);
+ affected_lrd_ids.add(iter);
+ }
+ }
+
+ // recompute lofs
+ if (stepprog != null) {
+ stepprog.beginStep(2, "Recompute LOFS.", LOG);
+ }
+ List<? extends DistanceDBIDList<D>> primDistRKNNs = lofResult.getRkNNRefer().getRKNNForBulkDBIDs(affected_lrd_ids, krefer);
+ ArrayDBIDs affected_lof_ids = mergeIDs(primDistRKNNs, affected_lrd_ids, insertions, updates1);
+ recomputeLOFs(affected_lof_ids, lofResult);
+
+ // fire result changed
+ if (stepprog != null) {
+ stepprog.beginStep(3, "Inform listeners.", LOG);
+ }
+ lofResult.getResult().getHierarchy().resultChanged(lofResult.getResult());
+
+ if (stepprog != null) {
+ stepprog.setCompleted(LOG);
+ }
+ }
+
+ /**
+ * Invoked after kNNs have been removed and updated, updates the result.
+ *
+ * @param deletions the ids of the removed neighborhoods
+ * @param updates1 the ids of the updated neighborhood w.r.t. the
+ * neighborhood distance function
+ * @param updates2 the ids of the updated neighborhood w.r.t. the
+ * reachability distance function
+ * @param lofResult the result of the former LOF run
+ */
+ private void kNNsRemoved(DBIDs deletions, DBIDs updates1, DBIDs updates2, LOFResult<O, D> lofResult) {
+ StepProgress stepprog = LOG.isVerbose() ? new StepProgress(4) : null;
+
+ // delete lrds and lofs
+ if (stepprog != null) {
+ stepprog.beginStep(1, "Delete old LRDs and LOFs.", LOG);
+ }
+ for (DBIDIter iter = deletions.iter(); iter.valid(); iter.advance()) {
+ lofResult.getLrds().delete(iter);
+ lofResult.getLofs().delete(iter);
+ }
+
+ // recompute lrds
+ if (stepprog != null) {
+ stepprog.beginStep(2, "Recompute LRDs.", LOG);
+ }
+ ArrayDBIDs lrd_ids = DBIDUtil.ensureArray(updates2);
+ List<? extends DistanceDBIDList<D>> reachDistRKNNs = lofResult.getRkNNReach().getRKNNForBulkDBIDs(lrd_ids, kreach);
+ ArrayDBIDs affected_lrd_id_candidates = mergeIDs(reachDistRKNNs, lrd_ids);
+ ArrayModifiableDBIDs affected_lrd_ids = DBIDUtil.newArray(affected_lrd_id_candidates.size());
+ WritableDoubleDataStore new_lrds = computeLRDs(affected_lrd_id_candidates, lofResult.getKNNReach());
+ for (DBIDIter iter = affected_lrd_id_candidates.iter(); iter.valid(); iter.advance()) {
+ double new_lrd = new_lrds.doubleValue(iter);
+ double old_lrd = lofResult.getLrds().doubleValue(iter);
+ if (old_lrd != new_lrd) {
+ lofResult.getLrds().putDouble(iter, new_lrd);
+ affected_lrd_ids.add(iter);
+ }
+ }
+
+ // recompute lofs
+ if (stepprog != null) {
+ stepprog.beginStep(3, "Recompute LOFS.", LOG);
+ }
+ List<? extends DistanceDBIDList<D>> primDistRKNNs = lofResult.getRkNNRefer().getRKNNForBulkDBIDs(affected_lrd_ids, krefer);
+ ArrayDBIDs affected_lof_ids = mergeIDs(primDistRKNNs, affected_lrd_ids, updates1);
+ recomputeLOFs(affected_lof_ids, lofResult);
+
+ // fire result changed
+ if (stepprog != null) {
+ stepprog.beginStep(4, "Inform listeners.", LOG);
+ }
+ lofResult.getResult().getHierarchy().resultChanged(lofResult.getResult());
+
+ if (stepprog != null) {
+ stepprog.setCompleted(LOG);
+ }
+ }
+
+ /**
+ * Merges the ids of the query result with the specified ids.
+ *
+ * @param queryResults the list of query results
+ * @param ids the list of ids
+ * @return a set containing the ids of the query result and the specified
+ * ids
+ */
+ private ArrayModifiableDBIDs mergeIDs(List<? extends DistanceDBIDList<D>> queryResults, DBIDs... ids) {
+ ModifiableDBIDs result = DBIDUtil.newHashSet();
+ for (DBIDs dbids : ids) {
+ result.addDBIDs(dbids);
+ }
+ for (DistanceDBIDList<D> queryResult : queryResults) {
+ result.addDBIDs(queryResult);
+ }
+ return DBIDUtil.newArray(result);
+ }
+
+ /**
+ * Recomputes the lofs of the specified ids.
+ *
+ * @param ids the ids of the lofs to be recomputed
+ * @param lofResult the result of the former LOF run
+ */
+ private void recomputeLOFs(DBIDs ids, LOFResult<O, D> lofResult) {
+ Pair<WritableDoubleDataStore, DoubleMinMax> lofsAndMax = computeLOFs(ids, lofResult.getLrds(), lofResult.getKNNRefer());
+ WritableDoubleDataStore new_lofs = lofsAndMax.getFirst();
+ for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ lofResult.getLofs().putDouble(iter, new_lofs.doubleValue(iter));
+ }
+ // track the maximum value for normalization.
+ DoubleMinMax new_lofminmax = lofsAndMax.getSecond();
+
+ // Actualize meta info
+ if (new_lofminmax.isValid() && lofResult.getResult().getOutlierMeta().getActualMaximum() < new_lofminmax.getMax()) {
+ BasicOutlierScoreMeta scoreMeta = (BasicOutlierScoreMeta) lofResult.getResult().getOutlierMeta();
+ scoreMeta.setActualMaximum(new_lofminmax.getMax());
+ }
+
+ if (new_lofminmax.isValid() && lofResult.getResult().getOutlierMeta().getActualMinimum() > new_lofminmax.getMin()) {
+ BasicOutlierScoreMeta scoreMeta = (BasicOutlierScoreMeta) lofResult.getResult().getOutlierMeta();
+ scoreMeta.setActualMinimum(new_lofminmax.getMin());
+ }
+ }
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends FlexibleLOF.Parameterizer<O, D> {
+ @Override
+ protected OnlineLOF<O, D> makeInstance() {
+ return new OnlineLOF<>(kreach, krefer, distanceFunction, reachabilityDistanceFunction);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java new file mode 100644 index 00000000..2ff7534a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java @@ -0,0 +1,287 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceKNNList; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; +import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; +import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * A simple variant of the LOF algorithm, which uses a simple kernel density + * estimation instead of the local reachability density. + * + * @author Erich Schubert + * + * @apiviz.has KNNQuery + * @apiviz.has KernelDensityFunction + * + * @param <O> the type of objects handled by this Algorithm + * @param <D> Distance type + */ +public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(SimpleKernelDensityLOF.class); + + /** + * Parameter k. + */ + protected int k; + + /** + * Kernel density function + */ + private KernelDensityFunction kernel; + + /** + * Constructor. + * + * @param k the value of k + * @param kernel Kernel function + */ + public SimpleKernelDensityLOF(int k, DistanceFunction<? super O, D> distance, KernelDensityFunction kernel) { + super(distance); + this.k = k + 1; + this.kernel = kernel; + } + + /** + * Run the naive kernel density LOF algorithm. + * + * @param database Database to query + * @param relation Data to process + * @return LOF outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + StepProgress stepprog = LOG.isVerbose() ? new StepProgress("KernelDensityLOF", 3) : null; + + final int dim = RelationUtil.dimensionality(relation); + + DBIDs ids = relation.getDBIDs(); + + // "HEAVY" flag for KNN Query since it is used more than once + KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); + // No optimized kNN query - use a preprocessor! + if (!(knnq instanceof PreprocessorKNNQuery)) { + if (stepprog != null) { + stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG); + } + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); + database.addIndex(preproc); + DistanceQuery<O, D> rdq = database.getDistanceQuery(relation, getDistanceFunction()); + knnq = preproc.getKNNQuery(rdq, k); + } + + // Compute LRDs + if (stepprog != null) { + stepprog.beginStep(2, "Computing densities.", LOG); + } + WritableDoubleDataStore dens = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + FiniteProgress densProgress = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null; + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); + int count = 0; + double sum = 0.0; + if (neighbors instanceof DoubleDistanceKNNList) { + // Fast version for double distances + for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + double max = ((DoubleDistanceKNNList)knnq.getKNNForDBID(neighbor, k)).doubleKNNDistance(); + final double v = neighbor.doubleDistance() / max; + sum += kernel.density(v) / Math.pow(max, dim); + count++; + } + } else { + for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + double max = knnq.getKNNForDBID(neighbor, k).getKNNDistance().doubleValue(); + final double v = neighbor.getDistance().doubleValue() / max; + sum += kernel.density(v) / Math.pow(max, dim); + count++; + } + } + final double density = sum / count; + dens.putDouble(it, density); + if (densProgress != null) { + densProgress.incrementProcessed(LOG); + } + } + if (densProgress != null) { + densProgress.ensureCompleted(LOG); + } + + // compute LOF_SCORE of each db object + if (stepprog != null) { + stepprog.beginStep(3, "Computing KLOFs.", LOG); + } + WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + // track the maximum value for normalization. + DoubleMinMax lofminmax = new DoubleMinMax(); + + FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("KLOF_SCORE for objects", ids.size(), LOG) : null; + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + final double lrdp = dens.doubleValue(it); + final double lof; + if (lrdp > 0) { + final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); + double sum = 0.0; + int count = 0; + for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + // skip the point itself + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + sum += dens.doubleValue(neighbor); + count++; + } + lof = sum / (count * lrdp); + } else { + lof = 1.0; + } + lofs.putDouble(it, lof); + // update minimum and maximum + lofminmax.put(lof); + + if (progressLOFs != null) { + progressLOFs.incrementProcessed(LOG); + } + } + if (progressLOFs != null) { + progressLOFs.ensureCompleted(LOG); + } + + if (stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Kernel Density Local Outlier Factor", "kernel-density-slof-outlier", TypeUtil.DOUBLE, lofs, ids); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(new CombinedTypeInformation(getDistanceFunction().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD)); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> vector type + * @param <D> distance type + */ + public static class Parameterizer<O extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * Option ID for kernel density LOF kernel. + */ + public static final OptionID KERNEL_ID = new OptionID("kernellof.kernel", "Kernel to use for kernel density LOF."); + + /** + * The neighborhood size to use. + */ + protected int k = 2; + + /** + * Kernel density function parameter + */ + KernelDensityFunction kernel; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID); + pK.addConstraint(new GreaterConstraint(1)); + if (config.grab(pK)) { + k = pK.getValue(); + } + + ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class); + if (config.grab(kernelP)) { + kernel = kernelP.instantiateClass(config); + } + } + + @Override + protected SimpleKernelDensityLOF<O, D> makeInstance() { + return new SimpleKernelDensityLOF<>(k, distanceFunction, kernel); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java new file mode 100644 index 00000000..413eaca1 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java @@ -0,0 +1,264 @@ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; +import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; +import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceKNNList; +import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; +import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; +import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; +import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; +import de.lmu.ifi.dbs.elki.math.DoubleMinMax; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; +import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; +import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta; +import de.lmu.ifi.dbs.elki.utilities.Alias; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * A simplified version of the original LOF algorithm, which does not use the + * reachability distance, yielding less stable results on inliers. + * + * Reference: + * <p> + * Erich Schubert, Arthur Zimek, Hans-Peter Kriegel<br /> + * Local outlier detection reconsidered: a generalized view on locality with + * applications to spatial, video, and network outlier detection<br /> + * In: Data Mining and Knowledge Discovery + * </p> + * + * @author Erich Schubert + * + * @apiviz.has KNNQuery + * + * @param <O> the type of DatabaseObjects handled by this Algorithm + * @param <D> Distance type + */ +@Reference(authors = "Erich Schubert, Arthur Zimek, Hans-Peter Kriegel", title = "Local outlier detection reconsidered: a generalized view on locality with applications to spatial, video, and network outlier detection", booktitle = "Data Mining and Knowledge Discovery", url = "http://dx.doi.org/10.1007/s10618-012-0300-z") +@Alias({ "SimpleLOF", "outlier.SimpleLOF", "de.lmu.ifi.dbs.elki.algorithm.outlier.SimpleLOF" }) +public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(SimplifiedLOF.class); + + /** + * Parameter k. + */ + protected int k; + + /** + * Constructor. + * + * @param k the value of k + */ + public SimplifiedLOF(int k, DistanceFunction<? super O, D> distance) { + super(distance); + this.k = k + 1; + } + + /** + * Run the Simple LOF algorithm. + * + * @param database Database to query + * @param relation Data to process + * @return LOF outlier result + */ + public OutlierResult run(Database database, Relation<O> relation) { + StepProgress stepprog = LOG.isVerbose() ? new StepProgress("SimpleLOF", 3) : null; + + DBIDs ids = relation.getDBIDs(); + + // "HEAVY" flag for KNN Query since it is used more than once + KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE); + // No optimized kNN query - use a preprocessor! + if (!(knnq instanceof PreprocessorKNNQuery)) { + if (stepprog != null) { + stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG); + } + MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k); + database.addIndex(preproc); + DistanceQuery<O, D> rdq = database.getDistanceQuery(relation, getDistanceFunction()); + knnq = preproc.getKNNQuery(rdq, k); + } + + // Compute LRDs + if (stepprog != null) { + stepprog.beginStep(2, "Computing densities.", LOG); + } + WritableDoubleDataStore dens = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); + FiniteProgress densProgress = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null; + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); + double sum = 0.0; + int count = 0; + if (neighbors instanceof DoubleDistanceKNNList) { + // Fast version for double distances + for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + sum += neighbor.doubleDistance(); + count++; + } + } else { + for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + sum += neighbor.getDistance().doubleValue(); + count++; + } + } + // Avoid division by 0 + final double lrd = (sum > 0) ? (count / sum) : 0; + dens.putDouble(it, lrd); + if (densProgress != null) { + densProgress.incrementProcessed(LOG); + } + } + if (densProgress != null) { + densProgress.ensureCompleted(LOG); + } + + // compute LOF_SCORE of each db object + if (stepprog != null) { + stepprog.beginStep(3, "Computing SLOFs.", LOG); + } + WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + // track the maximum value for normalization. + DoubleMinMax lofminmax = new DoubleMinMax(); + + FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("Simple LOF scores.", ids.size(), LOG) : null; + for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + final double lrdp = dens.doubleValue(it); + final double lof; + if (lrdp > 0) { + final KNNList<D> neighbors = knnq.getKNNForDBID(it, k); + double sum = 0.0; + int count = 0; + for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { + // skip the point itself + if (DBIDUtil.equal(neighbor, it)) { + continue; + } + sum += dens.doubleValue(neighbor); + count++; + } + lof = sum / (count * lrdp); + } else { + lof = 1.0; + } + lofs.putDouble(it, lof); + // update minimum and maximum + lofminmax.put(lof); + + if (progressLOFs != null) { + progressLOFs.incrementProcessed(LOG); + } + } + if (progressLOFs != null) { + progressLOFs.ensureCompleted(LOG); + } + + if (stepprog != null) { + stepprog.setCompleted(LOG); + } + + // Build result representation. + Relation<Double> scoreResult = new MaterializedRelation<>("Simple Local Outlier Factor", "simple-lof-outlier", TypeUtil.DOUBLE, lofs, ids); + OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0); + OutlierResult result = new OutlierResult(scoreMeta, scoreResult); + + return result; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <O> vector type + * @param <D> distance type + */ + public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + /** + * The neighborhood size to use. + */ + protected int k = 2; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + + final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID); + pK.addConstraint(new GreaterConstraint(1)); + if (config.grab(pK)) { + k = pK.getValue(); + } + } + + @Override + protected SimplifiedLOF<O, D> makeInstance() { + return new SimplifiedLOF<>(k, distanceFunction); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/package-info.java new file mode 100644 index 00000000..48d4b16a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/package-info.java @@ -0,0 +1,27 @@ +/** + * <p>LOF family of outlier detection algorithms.</p> + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.algorithm.outlier.lof; + |