summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java109
1 files changed, 52 insertions, 57 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
index 489f811b..b8372884 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,8 +23,6 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.BitSet;
-
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
import de.lmu.ifi.dbs.elki.data.NumberVector;
@@ -42,10 +40,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
@@ -59,6 +57,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.textwriter.TextWriteable;
import de.lmu.ifi.dbs.elki.result.textwriter.TextWriterStream;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TiedTopBoundedHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
@@ -91,12 +90,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @apiviz.has SharedNearestNeighborSimilarityFunction
*
* @param <V> the type of NumberVector handled by this Algorithm
- * @param <D> distance type
*/
@Title("SOD: Subspace outlier degree")
@Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data")
@Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2")
-public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
+public class SOD<V extends NumberVector> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
/**
* The logger for this class.
*/
@@ -115,7 +113,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
/**
* Similarity function to use.
*/
- private SimilarityFunction<V, D> similarityFunction;
+ private SimilarityFunction<V> similarityFunction;
/**
* Report models.
@@ -130,7 +128,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @param similarityFunction Shared nearest neighbor similarity function
* @param models Report generated models
*/
- public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction, boolean models) {
+ public SOD(int knn, double alpha, SimilarityFunction<V> similarityFunction, boolean models) {
super();
this.knn = knn;
this.alpha = alpha;
@@ -145,54 +143,51 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @return Outlier result
*/
public OutlierResult run(Relation<V> relation) {
- SimilarityQuery<V, D> snnInstance = similarityFunction.instantiate(relation);
+ SimilarityQuery<V> snnInstance = similarityFunction.instantiate(relation);
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), LOG) : null;
final WritableDoubleDataStore sod_scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
WritableDataStore<SODModel> sod_models = null;
- if (models) { // Models requested
+ if(models) { // Models requested
sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class);
}
DoubleMinMax minmax = new DoubleMinMax();
- for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
- if (progress != null) {
- progress.incrementProcessed(LOG);
- }
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ LOG.incrementProcessed(progress);
DBIDs neighborhood = getNearestNeighbors(relation, snnInstance, iter);
Vector center;
- BitSet weightVector;
+ long[] weightVector;
double sod;
- if (neighborhood.size() > 0) {
+ if(neighborhood.size() > 0) {
center = Centroid.make(relation, neighborhood);
// Note: per-dimension variances; no covariances.
double[] variances = computePerDimensionVariances(relation, center, neighborhood);
double expectationOfVariance = Mean.of(variances);
- weightVector = new BitSet(variances.length);
- for (int d = 0; d < variances.length; d++) {
- if (variances[d] < alpha * expectationOfVariance) {
- weightVector.set(d, true);
+ weightVector = BitsUtil.zero(variances.length);
+ for(int d = 0; d < variances.length; d++) {
+ if(variances[d] < alpha * expectationOfVariance) {
+ BitsUtil.setI(weightVector, d);
}
}
sod = subspaceOutlierDegree(relation.get(iter), center, weightVector);
- } else {
+ }
+ else {
center = relation.get(iter).getColumnVector();
weightVector = null;
sod = 0.;
}
- if (sod_models != null) {
+ if(sod_models != null) {
sod_models.put(iter, new SODModel(center, weightVector));
}
sod_scores.putDouble(iter, sod);
minmax.put(sod);
}
- if (progress != null) {
- progress.ensureCompleted(LOG);
- }
+ LOG.ensureCompleted(progress);
// combine results.
OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
- OutlierResult sodResult = new OutlierResult(meta, new MaterializedRelation<>("Subspace Outlier Degree", "sod-outlier", TypeUtil.DOUBLE, sod_scores, relation.getDBIDs()));
- if (sod_models != null) {
+ OutlierResult sodResult = new OutlierResult(meta, new MaterializedDoubleRelation("Subspace Outlier Degree", "sod-outlier", sod_scores, relation.getDBIDs()));
+ if(sod_models != null) {
Relation<SODModel> models = new MaterializedRelation<>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation<>(SODModel.class), sod_models, relation.getDBIDs());
sodResult.addChildResult(models);
}
@@ -200,9 +195,9 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
}
/**
- * Provides the k nearest neighbors in terms of the shared nearest neighbor
+ * Get the k nearest neighbors in terms of the shared nearest neighbor
* distance.
- * <p/>
+ *
* The query object is excluded from the knn list.
*
* FIXME: move this to the database layer.
@@ -213,20 +208,20 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @return the k nearest neighbors in terms of the shared nearest neighbor
* distance without the query object
*/
- private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V, D> simQ, DBIDRef queryObject) {
+ private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V> simQ, DBIDRef queryObject) {
Heap<DoubleDBIDPair> nearestNeighbors = new TiedTopBoundedHeap<>(knn);
- for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
- if (DBIDUtil.equal(iter, queryObject)) {
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ if(DBIDUtil.equal(iter, queryObject)) {
continue;
}
- double sim = simQ.similarity(queryObject, iter).doubleValue();
- if (sim > 0.) {
+ double sim = simQ.similarity(queryObject, iter);
+ if(sim > 0.) {
nearestNeighbors.add(DBIDUtil.newPair(sim, iter));
}
}
// Collect DBIDs
ArrayModifiableDBIDs dbids = DBIDUtil.newArray(nearestNeighbors.size());
- while (nearestNeighbors.size() > 0) {
+ while(nearestNeighbors.size() > 0) {
dbids.add(nearestNeighbors.poll());
}
return dbids;
@@ -240,17 +235,17 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @param neighborhood Neighbors
* @return Per-dimension variances.
*/
- private static double[] computePerDimensionVariances(Relation<? extends NumberVector<?>> relation, Vector center, DBIDs neighborhood) {
+ private static double[] computePerDimensionVariances(Relation<? extends NumberVector> relation, Vector center, DBIDs neighborhood) {
double[] c = center.getArrayRef();
double[] variances = new double[c.length];
- for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
- NumberVector<?> databaseObject = relation.get(iter);
- for (int d = 0; d < c.length; d++) {
+ for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
+ NumberVector databaseObject = relation.get(iter);
+ for(int d = 0; d < c.length; d++) {
final double deviation = databaseObject.doubleValue(d) - c[d];
variances[d] += deviation * deviation;
}
}
- for (int d = 0; d < variances.length; d++) {
+ for(int d = 0; d < variances.length; d++) {
variances[d] /= neighborhood.size();
}
return variances;
@@ -264,15 +259,15 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @param weightVector Weight vector
* @return sod score
*/
- private double subspaceOutlierDegree(V queryObject, Vector center, BitSet weightVector) {
- final int card = weightVector.cardinality();
- if (card == 0) {
+ private double subspaceOutlierDegree(V queryObject, Vector center, long[] weightVector) {
+ final int card = BitsUtil.cardinality(weightVector);
+ if(card == 0) {
return 0;
}
final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector);
- double distance = df.distance(queryObject, center).doubleValue();
- distance /= card; // FIXME: defined as card, should be sqrt(card),
- // unfortunately
+ double distance = df.distance(queryObject, center);
+ distance /= card; // FIXME: defined and published as card, should be
+ // sqrt(card), unfortunately
return distance;
}
@@ -300,7 +295,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
/**
* Relevant dimensions.
*/
- private BitSet weightVector;
+ private long[] weightVector;
/**
* Initialize SOD Model
@@ -308,7 +303,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @param center Center vector
* @param weightVector Selected dimensions
*/
- public SODModel(Vector center, BitSet weightVector) {
+ public SODModel(Vector center, long[] weightVector) {
this.center = center;
this.weightVector = weightVector;
}
@@ -316,7 +311,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
@Override
public void writeToText(TextWriterStream out, String label) {
out.commentPrintLn(this.getClass().getSimpleName() + ":");
- out.commentPrintLn("relevant attributes (counting starts with 0): " + this.weightVector.toString());
+ out.commentPrintLn("relevant attributes (starting with 0): " + BitsUtil.toString(weightVector, ", ", 0));
out.commentPrintLn("center of neighborhood: " + out.normalizationRestore(center).toString());
out.commentPrintSeparator();
}
@@ -329,7 +324,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
/**
* Parameter to specify the number of shared nearest neighbors to be
* considered for learning the subspace properties., must be an integer
@@ -366,7 +361,7 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
/**
* The similarity function.
*/
- private SimilarityFunction<V, D> similarityFunction;
+ private SimilarityFunction<V> similarityFunction;
/**
* Track models.
@@ -376,31 +371,31 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- final ObjectParameter<SimilarityFunction<V, D>> simP = new ObjectParameter<>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class);
- if (config.grab(simP)) {
+ final ObjectParameter<SimilarityFunction<V>> simP = new ObjectParameter<>(SIM_ID, SimilarityFunction.class, SharedNearestNeighborSimilarityFunction.class);
+ if(config.grab(simP)) {
similarityFunction = simP.instantiateClass(config);
}
final IntParameter knnP = new IntParameter(KNN_ID);
knnP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
- if (config.grab(knnP)) {
+ if(config.grab(knnP)) {
knn = knnP.getValue();
}
final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.1);
alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
- if (config.grab(alphaP)) {
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
final Flag modelsF = new Flag(MODELS_ID);
- if (config.grab(modelsF)) {
+ if(config.grab(modelsF)) {
models = modelsF.isTrue();
}
}
@Override
- protected SOD<V, D> makeInstance() {
+ protected SOD<V> makeInstance() {
return new SOD<>(knn, alpha, similarityFunction, models);
}
}