diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java | 544 |
1 files changed, 73 insertions, 471 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java index ad0b8175..65447713 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java @@ -23,59 +23,45 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.HashMap; - -import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; -import de.lmu.ifi.dbs.elki.database.QueryUtil; +import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; -import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; -import de.lmu.ifi.dbs.elki.database.ids.DBIDRange; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; -import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList; -import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.distance.similarityfunction.PrimitiveSimilarityFunction; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix; import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.PolynomialKernelFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MeanVariance; -import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap; -import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; -import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * Angle-Based Outlier Detection + * Angle-Based Outlier Detection / Angle-Based Outlier Factor. * * Outlier detection using variance analysis on angles, especially for high - * dimensional data sets. + * dimensional data sets. Exact version, which has cubic runtime (see also + * {@link FastABOD} and {@link LBABOD} for faster versions). * * H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in * High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge @@ -84,475 +70,107 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @author Matthias Schubert (Original Code) * @author Erich Schubert (ELKIfication) * - * @apiviz.has KNNQuery - * * @param <V> Vector type */ @Title("ABOD: Angle-Based Outlier Detection") @Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.") @Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946") -public class ABOD<V extends NumberVector<?>> extends AbstractDistanceBasedAlgorithm<V, DoubleDistance, OutlierResult> implements OutlierAlgorithm { +public class ABOD<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(ABOD.class); /** - * Parameter for k, the number of neighbors used in kNN queries. - */ - public static final OptionID K_ID = new OptionID("abod.k", "Parameter k for kNN queries."); - - /** - * Parameter for sample size to be used in fast mode. - */ - public static final OptionID FAST_SAMPLE_ID = new OptionID("abod.samplesize", "Sample size to enable fast mode."); - - /** - * Parameter for the kernel function. - */ - public static final OptionID KERNEL_FUNCTION_ID = new OptionID("abod.kernelfunction", "Kernel function to use."); - - /** - * The preprocessor used to materialize the kNN neighborhoods. - */ - public static final OptionID PREPROCESSOR_ID = new OptionID("abod.knnquery", "Processor to compute the kNN neighborhoods."); - - /** - * use alternate code below. - */ - private static final boolean USE_RND_SAMPLE = false; - - /** - * k parameter. - */ - private int k; - - /** - * Variable to store fast mode sampling value. - */ - int sampleSize = 0; - - /** * Store the configured Kernel version. */ - private PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction; - - /** - * Static DBID map. - */ - private ArrayDBIDs staticids = null; + protected SimilarityFunction<? super V, DoubleDistance> kernelFunction; /** - * Actual constructor, with parameters. Fast mode (sampling). + * Constructor for Angle-Based Outlier Detection (ABOD). * - * @param k k parameter - * @param sampleSize sample size - * @param primitiveKernelFunction Kernel function to use - * @param distanceFunction Distance function + * @param kernelFunction kernel function to use */ - public ABOD(int k, int sampleSize, PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction, DistanceFunction<V, DoubleDistance> distanceFunction) { - super(distanceFunction); - this.k = k; - this.sampleSize = sampleSize; - this.primitiveKernelFunction = primitiveKernelFunction; + public ABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction) { + super(); + this.kernelFunction = kernelFunction; } /** - * Actual constructor, with parameters. Slow mode (exact). + * Run ABOD on the data set. * - * @param k k parameter - * @param primitiveKernelFunction kernel function to use - * @param distanceFunction Distance function + * @param relation Relation to process + * @return Outlier detection result */ - public ABOD(int k, PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction, DistanceFunction<V, DoubleDistance> distanceFunction) { - super(distanceFunction); - this.k = k; - this.sampleSize = 0; - this.primitiveKernelFunction = primitiveKernelFunction; - } + public OutlierResult run(Database db, Relation<V> relation) { + DBIDs ids = relation.getDBIDs(); + // Build a kernel matrix, to make O(n^3) slightly less bad. + SimilarityQuery<V, DoubleDistance> sq = db.getSimilarityQuery(relation, kernelFunction); + KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids); - /** - * Main part of the algorithm. Exact version. - * - * @param relation Relation to query - * @return result - */ - public OutlierResult getRanking(Relation<V> relation) { - // Fix a static set of IDs - if (relation.getDBIDs() instanceof DBIDRange) { - staticids = (DBIDRange) relation.getDBIDs(); - } else { - staticids = DBIDUtil.newArray(relation.getDBIDs()); - ((ArrayModifiableDBIDs) staticids).sort(); - } - - KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); - ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(relation.size()); - - // preprocess kNN neighborhoods - KNNQuery<V, DoubleDistance> knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k); + WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); + DoubleMinMax minmaxabod = new DoubleMinMax(); MeanVariance s = new MeanVariance(); - for (DBIDIter objKey = relation.iterDBIDs(); objKey.valid(); objKey.advance()) { - s.reset(); - - KNNList<DoubleDistance> neighbors = knnQuery.getKNNForDBID(objKey, k); - for (DBIDIter key1 = neighbors.iter(); key1.valid(); key1.advance()) { - for (DBIDIter key2 = neighbors.iter(); key2.valid(); key2.advance()) { - if (DBIDUtil.equal(key2, key1) || DBIDUtil.equal(key1, objKey) || DBIDUtil.equal(key2, objKey)) { - continue; - } - double nenner = calcDenominator(kernelMatrix, objKey, key1, key2); - - if (nenner != 0) { - double sqrtnenner = Math.sqrt(nenner); - double tmp = calcNumerator(kernelMatrix, objKey, key1, key2) / nenner; - s.put(tmp, 1 / sqrtnenner); - } - - } - } - // Sample variance probably would be correct, however the numerical - // instabilities can actually break ABOD here. - pq.add(DBIDUtil.newPair(s.getNaiveVariance(), objKey)); + for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) { + final double abof = computeABOF(relation, kernelMatrix, pA, s); + minmaxabod.put(abof); + abodvalues.putDouble(pA, abof); } - DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); - while (!pq.isEmpty()) { - DoubleDBIDPair pair = pq.poll(); - abodvalues.putDouble(pair, pair.doubleValue()); - minmaxabod.put(pair.doubleValue()); - } // Build result representation. - Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); + Relation<Double> scoreResult = new MaterializedRelation<>("Angle-Based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs()); OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); return new OutlierResult(scoreMeta, scoreResult); } /** - * Main part of the algorithm. Fast version. + * Compute the exact ABOF value. * - * @param relation Relation to use - * @return result - */ - public OutlierResult getFastRanking(Relation<V> relation) { - final DBIDs ids = relation.getDBIDs(); - // Fix a static set of IDs - // TODO: add a DBIDUtil.ensureSorted? - if (relation.getDBIDs() instanceof DBIDRange) { - staticids = (DBIDRange) relation.getDBIDs(); - } else { - staticids = DBIDUtil.newArray(relation.getDBIDs()); - ((ArrayModifiableDBIDs) staticids).sort(); - } - - KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids); - - ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(relation.size()); - // get Candidate Ranking - for (DBIDIter aKey = relation.iterDBIDs(); aKey.valid(); aKey.advance()) { - WritableDoubleDataStore dists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT); - // determine kNearestNeighbors and pairwise distances - ComparableMinHeap<DoubleDBIDPair> nn; - if (!USE_RND_SAMPLE) { - nn = calcDistsandNN(relation, kernelMatrix, sampleSize, aKey, dists); - } else { - // alternative: - nn = calcDistsandRNDSample(relation, kernelMatrix, sampleSize, aKey, dists); - } - - // get normalization - double[] counter = calcFastNormalization(aKey, dists, staticids); - // umsetzen von Pq zu list - ModifiableDBIDs neighbors = DBIDUtil.newArray(nn.size()); - while (!nn.isEmpty()) { - neighbors.add(nn.poll()); - } - // getFilter - double var = getAbofFilter(kernelMatrix, aKey, dists, counter[1], counter[0], neighbors); - pq.add(DBIDUtil.newPair(var, aKey)); - } - // refine Candidates - ComparableMinHeap<DoubleDBIDPair> resqueue = new ComparableMinHeap<>(k); - MeanVariance s = new MeanVariance(); - while (!pq.isEmpty()) { - if (resqueue.size() == k && pq.peek().doubleValue() > resqueue.peek().doubleValue()) { - break; - } - // double approx = pq.peek().getFirst(); - DBIDRef aKey = pq.poll(); - s.reset(); - for (DBIDIter bKey = relation.iterDBIDs(); bKey.valid(); bKey.advance()) { - if (DBIDUtil.equal(bKey, aKey)) { - continue; - } - for (DBIDIter cKey = relation.iterDBIDs(); cKey.valid(); cKey.advance()) { - if (DBIDUtil.equal(cKey, aKey)) { - continue; - } - // double nenner = dists[y]*dists[z]; - double nenner = calcDenominator(kernelMatrix, aKey, bKey, cKey); - if (nenner != 0) { - double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner; - double sqrtNenner = Math.sqrt(nenner); - s.put(tmp, 1 / sqrtNenner); - } - } - } - double var = s.getSampleVariance(); - if (resqueue.size() < k) { - resqueue.add(DBIDUtil.newPair(var, aKey)); - } else { - if (resqueue.peek().doubleValue() > var) { - resqueue.replaceTopElement(DBIDUtil.newPair(var, aKey)); - } - } - - } - DoubleMinMax minmaxabod = new DoubleMinMax(); - WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); - while (!pq.isEmpty()) { - DoubleDBIDPair pair = pq.poll(); - abodvalues.putDouble(pair, pair.doubleValue()); - minmaxabod.put(pair.doubleValue()); - } - // Build result representation. - Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids); - OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY); - return new OutlierResult(scoreMeta, scoreResult); - } - - private double[] calcFastNormalization(DBIDRef x, WritableDoubleDataStore dists, DBIDs ids) { - double[] result = new double[2]; - - double sum = 0; - double sumF = 0; - for (DBIDIter yKey = ids.iter(); yKey.valid(); yKey.advance()) { - if (dists.doubleValue(yKey) != 0) { - double tmp = 1 / Math.sqrt(dists.doubleValue(yKey)); - sum += tmp; - sumF += (1 / dists.doubleValue(yKey)) * tmp; - } - } - double sofar = 0; - double sofarF = 0; - for (DBIDIter zKey = ids.iter(); zKey.valid(); zKey.advance()) { - if (dists.doubleValue(zKey) != 0) { - double tmp = 1 / Math.sqrt(dists.doubleValue(zKey)); - sofar += tmp; - double rest = sum - sofar; - result[0] += tmp * rest; - - sofarF += (1 / dists.doubleValue(zKey)) * tmp; - double restF = sumF - sofarF; - result[1] += (1 / dists.doubleValue(zKey)) * tmp * restF; - } - } - return result; - } - - private double getAbofFilter(KernelMatrix kernelMatrix, DBIDRef aKey, WritableDoubleDataStore dists, double fulCounter, double counter, DBIDs neighbors) { - double sum = 0.0; - double sqrSum = 0.0; - double partCounter = 0; - for (DBIDIter bKey = neighbors.iter(); bKey.valid(); bKey.advance()) { - if (DBIDUtil.equal(bKey, aKey)) { + * @param relation Relation + * @param kernelMatrix Kernel matrix + * @param pA Object A to compute ABOF for + * @param s Statistics tracker + * @return ABOF value + */ + protected double computeABOF(Relation<V> relation, KernelMatrix kernelMatrix, DBIDRef pA, MeanVariance s) { + s.reset(); // Reused + double simAA = kernelMatrix.getSimilarity(pA, pA); + + for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) { + if (DBIDUtil.equal(nB, pA)) { continue; } - for (DBIDIter cKey = neighbors.iter(); cKey.valid(); cKey.advance()) { - if (DBIDUtil.equal(cKey, aKey)) { - continue; - } - if (DBIDUtil.compare(bKey, cKey) > 0) { - double nenner = dists.doubleValue(bKey) * dists.doubleValue(cKey); - if (nenner != 0) { - double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner; - double sqrtNenner = Math.sqrt(nenner); - sum += tmp * (1 / sqrtNenner); - sqrSum += tmp * tmp * (1 / sqrtNenner); - partCounter += (1 / (sqrtNenner * nenner)); - } - } - } - } - // TODO: Document the meaning / use of fulCounter, partCounter. - double mu = (sum + (fulCounter - partCounter)) / counter; - return (sqrSum / counter) - (mu * mu); - } - - /** - * Compute the cosinus value between vectors aKey and bKey. - * - * @param kernelMatrix - * @param aKey - * @param bKey - * @return cosinus value - */ - private double calcCos(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey) { - final int ai = mapDBID(aKey); - final int bi = mapDBID(bKey); - return kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, bi) - 2 * kernelMatrix.getDistance(ai, bi); - } - - private int mapDBID(DBIDRef aKey) { - // TODO: this is not the most efficient... - int off = staticids.binarySearch(aKey); - if (off < 0) { - throw new AbortException("Did not find id " + aKey.toString() + " in staticids. " + staticids.contains(aKey)); - } - return off + 1; - } - - private double calcDenominator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) { - return calcCos(kernelMatrix, aKey, bKey) * calcCos(kernelMatrix, aKey, cKey); - } - - private double calcNumerator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) { - final int ai = mapDBID(aKey); - final int bi = mapDBID(bKey); - final int ci = mapDBID(cKey); - return (kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, ci) - kernelMatrix.getDistance(ai, ci) - kernelMatrix.getDistance(ai, bi)); - } - - private ComparableMinHeap<DoubleDBIDPair> calcDistsandNN(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) { - ComparableMinHeap<DoubleDBIDPair> nn = new ComparableMinHeap<>(sampleSize); - for (DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) { - double val = calcCos(kernelMatrix, aKey, bKey); - dists.putDouble(bKey, val); - if (nn.size() < sampleSize) { - nn.add(DBIDUtil.newPair(val, bKey)); - } else { - if (val < nn.peek().doubleValue()) { - nn.replaceTopElement(DBIDUtil.newPair(val, bKey)); - } - } - } - return nn; - } - - private ComparableMinHeap<DoubleDBIDPair> calcDistsandRNDSample(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) { - ComparableMinHeap<DoubleDBIDPair> nn = new ComparableMinHeap<>(sampleSize); - int step = (int) ((double) data.size() / (double) sampleSize); - int counter = 0; - for (DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) { - double val = calcCos(kernelMatrix, aKey, bKey); - dists.putDouble(bKey, val); - if (counter % step == 0) { - nn.add(DBIDUtil.newPair(val, bKey)); + double simBB = kernelMatrix.getSimilarity(nB, nB); + double simAB = kernelMatrix.getSimilarity(pA, nB); + double sqdAB = simAA + simBB - simAB - simAB; + if (!(sqdAB > 0.)) { + continue; } - counter++; - } - return nn; - } - - /** - * Get explanations for points in the database. - * - * @param data to get explanations for - * @return String explanation - */ - // TODO: this should be done by the result classes. - public String getExplanations(Relation<V> data) { - KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, data, staticids); - // PQ for Outlier Ranking - ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(data.size()); - HashMap<DBID, DBIDs> explaintab = new HashMap<>(); - // test all objects - MeanVariance s = new MeanVariance(), s2 = new MeanVariance(); - for (DBIDIter objKey = data.iterDBIDs(); objKey.valid(); objKey.advance()) { - s.reset(); - // Queue for the best explanation - ComparableMinHeap<DoubleDBIDPair> explain = new ComparableMinHeap<>(); - // determine Object - // for each pair of other objects - for (DBIDIter key1 = data.iterDBIDs(); key1.valid(); key1.advance()) { - // Collect Explanation Vectors - s2.reset(); - if (DBIDUtil.equal(objKey, key1)) { + for (DBIDIter nC = relation.iterDBIDs(); nC.valid(); nC.advance()) { + if (DBIDUtil.equal(nC, pA) || DBIDUtil.compare(nC, nB) < 0) { continue; } - for (DBIDIter key2 = data.iterDBIDs(); key2.valid(); key2.advance()) { - if (DBIDUtil.equal(key2, key1) || DBIDUtil.equal(objKey, key2)) { - continue; - } - double nenner = calcDenominator(kernelMatrix, objKey, key1, key2); - if (nenner != 0) { - double tmp = calcNumerator(kernelMatrix, objKey, key1, key2) / nenner; - double sqr = Math.sqrt(nenner); - s2.put(tmp, 1 / sqr); - } - } - explain.add(DBIDUtil.newPair(s2.getSampleVariance(), key1)); - s.put(s2); - } - // build variance of the observed vectors - pq.add(DBIDUtil.newPair(s.getSampleVariance(), objKey)); - // - ModifiableDBIDs expList = DBIDUtil.newArray(); - expList.add(explain.poll()); - while (!explain.isEmpty()) { - DBIDRef nextKey = explain.poll(); - if (DBIDUtil.equal(nextKey, objKey)) { + double simCC = kernelMatrix.getSimilarity(nC, nC); + double simAC = kernelMatrix.getSimilarity(pA, nC); + double sqdAC = simAA + simCC - simAC; + if (!(sqdAC > 0.)) { continue; } - double max = Double.MIN_VALUE; - for (DBIDIter exp = expList.iter(); exp.valid(); exp.advance()) { - if (DBIDUtil.equal(exp, objKey) || DBIDUtil.equal(nextKey, exp)) { - continue; - } - double nenner = Math.sqrt(calcCos(kernelMatrix, objKey, nextKey)) * Math.sqrt(calcCos(kernelMatrix, objKey, exp)); - double angle = calcNumerator(kernelMatrix, objKey, nextKey, exp) / nenner; - max = Math.max(angle, max); - } - if (max < 0.5) { - expList.add(nextKey); - } - } - explaintab.put(DBIDUtil.deref(objKey), expList); - } - StringBuilder buf = new StringBuilder(); - buf.append("Result: ABOD\n"); - int count = 0; - while (!pq.isEmpty()) { - if (count > 10) { - break; + // Exploit bilinearity of scalar product: + // <B-A, C-A> = <B, C-A> - <A,C-A> + // = <B,C> - <B,A> - <A,C> + <A,A> + // For computing variance, AA is a constant and can be ignored. + double simBC = kernelMatrix.getSimilarity(nB, nC); + double numerator = simBC - simAB - simAC; // + simAA; + double val = numerator / (sqdAB * sqdAC); + s.put(val, 1. / Math.sqrt(sqdAB * sqdAC)); } - double factor = pq.peek().doubleValue(); - DBIDRef key = pq.poll(); - buf.append(data.get(key)).append(' '); - buf.append(count).append(" Factor=").append(factor).append(' ').append(key).append('\n'); - DBIDs expList = explaintab.get(key); - generateExplanation(buf, data, key, expList); - count++; - } - return buf.toString(); - } - - private void generateExplanation(StringBuilder buf, Relation<V> data, DBIDRef key, DBIDs expList) { - Vector vect1 = data.get(key).getColumnVector(); - for (DBIDIter iter = expList.iter(); iter.valid(); iter.advance()) { - buf.append("Outlier: ").append(vect1).append('\n'); - Vector exp = data.get(iter).getColumnVector(); - buf.append("Most common neighbor: ").append(exp).append('\n'); - // determine difference Vector - Vector vals = exp.minus(vect1); - buf.append(vals).append('\n'); - } - } - - /** - * Run ABOD on the data set. - * - * @param relation Relation to process - * @return Outlier detection result - */ - public OutlierResult run(Relation<V> relation) { - if (sampleSize > 0) { - return getFastRanking(relation); - } else { - return getRanking(relation); } + // Sample variance probably would be correct, but the ABOD publication + // uses the naive variance. + final double abof = s.getNaiveVariance(); + return abof; } @Override @@ -572,45 +190,29 @@ public class ABOD<V extends NumberVector<?>> extends AbstractDistanceBasedAlgori * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<V, DoubleDistance> { - /** - * k Parameter. - */ - protected int k = 0; - + public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer { /** - * Sample size. + * Parameter for the kernel function. */ - protected int sampleSize = 0; + public static final OptionID KERNEL_FUNCTION_ID = new OptionID("abod.kernelfunction", "Kernel function to use."); /** * Distance function. */ - protected PrimitiveSimilarityFunction<V, DoubleDistance> primitiveKernelFunction = null; + protected SimilarityFunction<V, DoubleDistance> kernelFunction = null; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - final IntParameter kP = new IntParameter(K_ID, 30); - kP.addConstraint(new GreaterEqualConstraint(1)); - if (config.grab(kP)) { - k = kP.getValue(); - } - final IntParameter sampleSizeP = new IntParameter(FAST_SAMPLE_ID); - sampleSizeP.addConstraint(new GreaterEqualConstraint(1)); - sampleSizeP.setOptional(true); - if (config.grab(sampleSizeP)) { - sampleSize = sampleSizeP.getValue(); - } - final ObjectParameter<PrimitiveSimilarityFunction<V, DoubleDistance>> param = new ObjectParameter<>(KERNEL_FUNCTION_ID, PrimitiveSimilarityFunction.class, PolynomialKernelFunction.class); + final ObjectParameter<SimilarityFunction<V, DoubleDistance>> param = new ObjectParameter<>(KERNEL_FUNCTION_ID, SimilarityFunction.class, PolynomialKernelFunction.class); if (config.grab(param)) { - primitiveKernelFunction = param.instantiateClass(config); + kernelFunction = param.instantiateClass(config); } } @Override protected ABOD<V> makeInstance() { - return new ABOD<>(k, sampleSize, primitiveKernelFunction, distanceFunction); + return new ABOD<>(kernelFunction); } } } |