diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java | 109 |
1 files changed, 52 insertions, 57 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java index 489f811b..b8372884 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,8 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; along with this program. If not, see <http://www.gnu.org/licenses/>. */ -import java.util.BitSet; - import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; import de.lmu.ifi.dbs.elki.data.NumberVector; @@ -42,10 +40,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation; import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -59,6 +57,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable; import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; @@ -91,12 +90,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @apiviz.has SharedNearestNeighborSimilarityFunction * * @param <V> the type of NumberVector handled by this Algorithm - * @param <D> distance type */ @Title("SOD: Subspace outlier degree") @Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data") @Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2") -public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { +public class SOD<V extends NumberVector> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { /** * The logger for this class. */ @@ -115,7 +113,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte /** * Similarity function to use. */ - private SimilarityFunction<V, D> similarityFunction; + private SimilarityFunction<V> similarityFunction; /** * Report models. @@ -130,7 +128,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @param similarityFunction Shared nearest neighbor similarity function * @param models Report generated models */ - public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction, boolean models) { + public SOD(int knn, double alpha, SimilarityFunction<V> similarityFunction, boolean models) { super(); this.knn = knn; this.alpha = alpha; @@ -145,54 +143,51 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @return Outlier result */ public OutlierResult run(Relation<V> relation) { - SimilarityQuery<V, D> snnInstance = similarityFunction.instantiate(relation); + SimilarityQuery<V> snnInstance = similarityFunction.instantiate(relation); FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), LOG) : null; final WritableDoubleDataStore sod_scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC); WritableDataStore<SODModel> sod_models = null; - if (models) { // Models requested + if(models) { // Models requested sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class); } DoubleMinMax minmax = new DoubleMinMax(); - for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { - if (progress != null) { - progress.incrementProcessed(LOG); - } + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + LOG.incrementProcessed(progress); DBIDs neighborhood = getNearestNeighbors(relation, snnInstance, iter); Vector center; - BitSet weightVector; + long[] weightVector; double sod; - if (neighborhood.size() > 0) { + if(neighborhood.size() > 0) { center = Centroid.make(relation, neighborhood); // Note: per-dimension variances; no covariances. double[] variances = computePerDimensionVariances(relation, center, neighborhood); double expectationOfVariance = Mean.of(variances); - weightVector = new BitSet(variances.length); - for (int d = 0; d < variances.length; d++) { - if (variances[d] < alpha * expectationOfVariance) { - weightVector.set(d, true); + weightVector = BitsUtil.zero(variances.length); + for(int d = 0; d < variances.length; d++) { + if(variances[d] < alpha * expectationOfVariance) { + BitsUtil.setI(weightVector, d); } } sod = subspaceOutlierDegree(relation.get(iter), center, weightVector); - } else { + } + else { center = relation.get(iter).getColumnVector(); weightVector = null; sod = 0.; } - if (sod_models != null) { + if(sod_models != null) { sod_models.put(iter, new SODModel(center, weightVector)); } sod_scores.putDouble(iter, sod); minmax.put(sod); } - if (progress != null) { - progress.ensureCompleted(LOG); - } + LOG.ensureCompleted(progress); // combine results. OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax()); - OutlierResult sodResult = new OutlierResult(meta, new MaterializedRelation<>("Subspace Outlier Degree", "sod-outlier", TypeUtil.DOUBLE, sod_scores, relation.getDBIDs())); - if (sod_models != null) { + OutlierResult sodResult = new OutlierResult(meta, new MaterializedDoubleRelation("Subspace Outlier Degree", "sod-outlier", sod_scores, relation.getDBIDs())); + if(sod_models != null) { Relation<SODModel> models = new MaterializedRelation<>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation<>(SODModel.class), sod_models, relation.getDBIDs()); sodResult.addChildResult(models); } @@ -200,9 +195,9 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte } /** - * Provides the k nearest neighbors in terms of the shared nearest neighbor + * Get the k nearest neighbors in terms of the shared nearest neighbor * distance. - * <p/> + * * The query object is excluded from the knn list. * * FIXME: move this to the database layer. @@ -213,20 +208,20 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @return the k nearest neighbors in terms of the shared nearest neighbor * distance without the query object */ - private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V, D> simQ, DBIDRef queryObject) { + private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V> simQ, DBIDRef queryObject) { Heap<DoubleDBIDPair> nearestNeighbors = new TiedTopBoundedHeap<>(knn); - for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { - if (DBIDUtil.equal(iter, queryObject)) { + for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) { + if(DBIDUtil.equal(iter, queryObject)) { continue; } - double sim = simQ.similarity(queryObject, iter).doubleValue(); - if (sim > 0.) { + double sim = simQ.similarity(queryObject, iter); + if(sim > 0.) { nearestNeighbors.add(DBIDUtil.newPair(sim, iter)); } } // Collect DBIDs ArrayModifiableDBIDs dbids = DBIDUtil.newArray(nearestNeighbors.size()); - while (nearestNeighbors.size() > 0) { + while(nearestNeighbors.size() > 0) { dbids.add(nearestNeighbors.poll()); } return dbids; @@ -240,17 +235,17 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @param neighborhood Neighbors * @return Per-dimension variances. */ - private static double[] computePerDimensionVariances(Relation<? extends NumberVector<?>> relation, Vector center, DBIDs neighborhood) { + private static double[] computePerDimensionVariances(Relation<? extends NumberVector> relation, Vector center, DBIDs neighborhood) { double[] c = center.getArrayRef(); double[] variances = new double[c.length]; - for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { - NumberVector<?> databaseObject = relation.get(iter); - for (int d = 0; d < c.length; d++) { + for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { + NumberVector databaseObject = relation.get(iter); + for(int d = 0; d < c.length; d++) { final double deviation = databaseObject.doubleValue(d) - c[d]; variances[d] += deviation * deviation; } } - for (int d = 0; d < variances.length; d++) { + for(int d = 0; d < variances.length; d++) { variances[d] /= neighborhood.size(); } return variances; @@ -264,15 +259,15 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @param weightVector Weight vector * @return sod score */ - private double subspaceOutlierDegree(V queryObject, Vector center, BitSet weightVector) { - final int card = weightVector.cardinality(); - if (card == 0) { + private double subspaceOutlierDegree(V queryObject, Vector center, long[] weightVector) { + final int card = BitsUtil.cardinality(weightVector); + if(card == 0) { return 0; } final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector); - double distance = df.distance(queryObject, center).doubleValue(); - distance /= card; // FIXME: defined as card, should be sqrt(card), - // unfortunately + double distance = df.distance(queryObject, center); + distance /= card; // FIXME: defined and published as card, should be + // sqrt(card), unfortunately return distance; } @@ -300,7 +295,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte /** * Relevant dimensions. */ - private BitSet weightVector; + private long[] weightVector; /** * Initialize SOD Model @@ -308,7 +303,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * @param center Center vector * @param weightVector Selected dimensions */ - public SODModel(Vector center, BitSet weightVector) { + public SODModel(Vector center, long[] weightVector) { this.center = center; this.weightVector = weightVector; } @@ -316,7 +311,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte @Override public void writeToText(TextWriterStream out, String label) { out.commentPrintLn(this.getClass().getSimpleName() + ":"); - out.commentPrintLn("relevant attributes (counting starts with 0): " + this.weightVector.toString()); + out.commentPrintLn("relevant attributes (starting with 0): " + BitsUtil.toString(weightVector, ", ", 0)); out.commentPrintLn("center of neighborhood: " + out.normalizationRestore(center).toString()); out.commentPrintSeparator(); } @@ -329,7 +324,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte * * @apiviz.exclude */ - public static class Parameterizer<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer { + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { /** * Parameter to specify the number of shared nearest neighbors to be * considered for learning the subspace properties., must be an integer @@ -366,7 +361,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte /** * The similarity function. */ - private SimilarityFunction<V, D> similarityFunction; + private SimilarityFunction<V> similarityFunction; /** * Track models. @@ -376,31 +371,31 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - final ObjectParameter<SimilarityFunction<V, D>> simP = new ObjectParameter<>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class); - if (config.grab(simP)) { + final ObjectParameter<SimilarityFunction<V>> simP = new ObjectParameter<>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class); + if(config.grab(simP)) { similarityFunction = simP.instantiateClass(config); } final IntParameter knnP = new IntParameter(KNN_ID); knnP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); - if (config.grab(knnP)) { + if(config.grab(knnP)) { knn = knnP.getValue(); } final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.1); alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); - if (config.grab(alphaP)) { + if(config.grab(alphaP)) { alpha = alphaP.doubleValue(); } final Flag modelsF = new Flag(MODELS_ID); - if (config.grab(modelsF)) { + if(config.grab(modelsF)) { models = modelsF.isTrue(); } } @Override - protected SOD<V, D> makeInstance() { + protected SOD<V> makeInstance() { return new SOD<>(knn, alpha, similarityFunction, models); } } |