package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2011 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.Collections; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.ProxyView; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * GLS-Backward Search is a statistical approach to detecting spatial outliers. * *

* F. Chen and C.-T. Lu and A. P. Boedihardjo:
* GLS-SOD: A Generalized Local Statistical Approach for Spatial Outlier * Detection
* In Proc. 16th ACM SIGKDD international conference on Knowledge discovery and * data mining, 2010 *

* * Implementation note: this is just the most basic version of this algorithm. * The spatial relation must be two dimensional, the set of spatial basis * functions is hard-coded (but trivial to enhance) to {1,x,y,x*x,y*y,x*y}, and * we assume the neighborhood is large enough for the simpler formulas to work * that make the optimization problem convex. * * @author Ahmed Hettab * * @param Vector type to use for distances * @param Distance function to use */ @Title("GLS-Backward Search") @Reference(authors = "F. Chen and C.-T. Lu and A. P. Boedihardjo", title = "GLS-SOD: A Generalized Local Statistical Approach for Spatial Outlier Detection", booktitle = "Proc. 16th ACM SIGKDD international conference on Knowledge discovery and data mining", url = "http://dx.doi.org/10.1145/1835804.1835939") public class CTLuGLSBackwardSearchAlgorithm, D extends NumberDistance> extends AbstractDistanceBasedAlgorithm implements OutlierAlgorithm { /** * The logger for this class. */ private static final Logging logger = Logging.getLogger(CTLuGLSBackwardSearchAlgorithm.class); /** * Parameter Alpha - significance niveau */ private double alpha; /** * Parameter k - neighborhood size */ private int k; /** * Constructor. * * @param distanceFunction Distance function * @param k number of nearest neighbors to use * @param alpha Significance niveau */ public CTLuGLSBackwardSearchAlgorithm(DistanceFunction distanceFunction, int k, double alpha) { super(distanceFunction); this.alpha = alpha; this.k = k; } /** * Run the algorithm * * @param relationx Spatial relation * @param relationy Attribute relation * @return Algorithm result */ public OutlierResult run(Relation relationx, Relation> relationy) { WritableDataStore scores = DataStoreUtil.makeStorage(relationx.getDBIDs(), DataStoreFactory.HINT_STATIC, Double.class); DoubleMinMax mm = new DoubleMinMax(0.0, 0.0); // Outlier detection loop { ModifiableDBIDs idview = DBIDUtil.newHashSet(relationx.getDBIDs()); ProxyView proxy = new ProxyView(relationx.getDatabase(), idview, relationx); double phialpha = MathUtil.standardNormalProbit(1.0 - alpha / 2); // Detect outliers while significant. while(true) { Pair candidate = singleIteration(proxy, relationy); if(candidate.second < phialpha) { break; } scores.put(candidate.first, candidate.second); if (!Double.isNaN(candidate.second)) { mm.put(candidate.second); } idview.remove(candidate.first); } // Remaining objects are inliers for(DBID id : idview) { scores.put(id, 0.0); } } Relation scoreResult = new MaterializedRelation("GLSSODBackward", "GLSSODbackward-outlier", TypeUtil.DOUBLE, scores, relationx.getDBIDs()); OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(mm.getMin(), mm.getMax(), 0, Double.POSITIVE_INFINITY, 0); return new OutlierResult(scoreMeta, scoreResult); } /** * Run a single iteration of the GLS-SOD modeling step * * @param relationx Geo relation * @param relationy Attribute relation * @return Top outlier and associated score */ private Pair singleIteration(Relation relationx, Relation> relationy) { final int dim = DatabaseUtil.dimensionality(relationx); final int dimy = DatabaseUtil.dimensionality(relationy); assert (dim == 2); KNNQuery knnQuery = QueryUtil.getKNNQuery(relationx, getDistanceFunction(), k + 1); // We need stable indexed DBIDs ArrayDBIDs ids = DBIDUtil.newArray(relationx.getDBIDs()); // Sort, so we can do a binary search below. Collections.sort(ids); // init F,X,Z Matrix X = new Matrix(ids.size(), 6); Matrix F = new Matrix(ids.size(), ids.size()); Matrix Y = new Matrix(ids.size(), dimy); for(int i = 0; i < ids.size(); i++) { DBID id = ids.get(i); // Fill the data matrix { V vec = relationx.get(id); double la = vec.doubleValue(1); double lo = vec.doubleValue(2); X.set(i, 0, 1.0); X.set(i, 1, la); X.set(i, 2, lo); X.set(i, 3, la * lo); X.set(i, 4, la * la); X.set(i, 5, lo * lo); } { for(int d = 0; d < dimy; d++) { double idy = relationy.get(id).doubleValue(d + 1); Y.set(i, d, idy); } } // Fill the neighborhood matrix F: { List> neighbors = knnQuery.getKNNForDBID(id, k + 1); ModifiableDBIDs neighborhood = DBIDUtil.newArray(neighbors.size()); for(DistanceResultPair dpair : neighbors) { if(id.equals(dpair.getDBID())) { continue; } neighborhood.add(dpair.getDBID()); } // Weight object itself positively. F.set(i, i, 1.0); final int nweight = -1 / neighborhood.size(); // We need to find the index positions of the neighbors, unfortunately. for(DBID nid : neighborhood) { int pos = Collections.binarySearch(ids, nid); assert (pos >= 0); F.set(pos, i, nweight); } } } // Estimate the parameter beta // Common term that we can save recomputing. Matrix common = X.transposeTimesTranspose(F).times(F); Matrix b = common.times(X).inverse().times(common.times(Y)); // Estimate sigma_0 and sigma: // sigma_sum_square = sigma_0*sigma_0 + sigma*sigma Matrix sigmaMat = F.times(X.times(b).minus(F.times(Y))); final double sigma_sum_square = sigmaMat.normF() / (relationx.size() - 6 - 1); final double norm = 1 / Math.sqrt(sigma_sum_square); // calculate the absolute values of standard residuals Matrix E = F.times(Y.minus(X.times(b))).timesEquals(norm); DBID worstid = null; double worstscore = Double.NEGATIVE_INFINITY; for(int i = 0; i < ids.size(); i++) { DBID id = ids.get(i); double err = E.getRowVector(i).euclideanLength(); // double err = Math.abs(E.get(i, 0)); if(err > worstscore) { worstscore = err; worstid = id; } } return new Pair(worstid, worstscore); } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction(), TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return logger; } /** * Parameterization class * * @author Erich Schubert * * @apiviz.exclude * * @param Input vector type * @param Distance type */ public static class Parameterizer, D extends NumberDistance> extends AbstractDistanceBasedAlgorithm.Parameterizer { /** * Holds the alpha value - significance niveau */ public static final OptionID ALPHA_ID = OptionID.getOrCreateOptionID("glsbs.alpha", "Significance niveau"); /** * Parameter to specify the k nearest neighbors */ public static final OptionID K_ID = OptionID.getOrCreateOptionID("glsbs.k", "k nearest neighbors to use"); /** * Parameter Alpha - significance niveau */ private double alpha; /** * Parameter k - neighborhood size */ private int k; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); getParameterAlpha(config); getParameterK(config); } @Override protected CTLuGLSBackwardSearchAlgorithm makeInstance() { return new CTLuGLSBackwardSearchAlgorithm(distanceFunction, k, alpha); } /** * Get the alpha parameter * * @param config Parameterization */ protected void getParameterAlpha(Parameterization config) { final DoubleParameter param = new DoubleParameter(ALPHA_ID); if(config.grab(param)) { alpha = param.getValue(); } } /** * Get the k parameter * * @param config Parameterization */ protected void getParameterK(Parameterization config) { final IntParameter param = new IntParameter(K_ID); if(config.grab(param)) { k = param.getValue(); } } } }