Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm')
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java | 11
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java | 88
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java | 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java | 16
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java | 9
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java | 93
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java | 97
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java | 14
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java | 21
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java | 51
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java | 364
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java | 11
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java | 11
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java | 350
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java | 59
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java | 148
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java | 153
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java | 27
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java | 302
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java | 900
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java | 28
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java | 10
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java | 6
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java | 25
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java | 96
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java | 89
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java | 11
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java | 5
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java | 217
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java | 195
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java | 27
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java | 33
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java | 346
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java | 155
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java | 8
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java | 11
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java | 8
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java | 8
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java | 53
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java | 75
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java | 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java | 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java | 23
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java | 384
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java | 27
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java | 9
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java | 605
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java | 160
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java | 7
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java | 1000
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java | 6
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java | 6
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java | 544
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java | 9
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java | 137
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java | 93
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java | 407
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java | 24
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java | 219
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java | 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java | 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java | 288
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java | 16
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java | 71
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java | 10
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java | 14
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java | 2
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java | 6
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java | 32
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java | 66
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java | 15
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java | 203
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java | 93
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java | 9
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java | 50
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java | 3
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java | 59
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java | 116
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java | 6
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java | 61
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java | 3
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java | 371
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java | 4
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java | 44
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java | 6
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java | 46
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java | 90
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java | 24
-rw-r--r-- src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java | 20
96 files changed, 7348 insertions, 2209 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java
index 07aaf3fc..a2f32989 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/APRIORI.java
@@ -44,8 +44,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OneMustBeSetGlobalConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OnlyOneIsAllowedToBeSetGlobalConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -296,7 +295,7 @@ public class APRIORI extends AbstractAlgorithm<AprioriResult> {
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.BIT_VECTOR_FIELD);
}
-
+
@Override
protected Logging getLogger() {
return LOG;
@@ -325,15 +324,15 @@ public class APRIORI extends AbstractAlgorithm<AprioriResult> {
super.makeOptions(config);
DoubleParameter minfreqP = new DoubleParameter(MINFREQ_ID);
minfreqP.setOptional(true);
- minfreqP.addConstraint(new GreaterEqualConstraint(0));
- minfreqP.addConstraint(new LessEqualConstraint(1));
+ minfreqP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ minfreqP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
if(config.grab(minfreqP)) {
minfreq = minfreqP.getValue();
}
IntParameter minsuppP = new IntParameter(MINSUPP_ID);
minsuppP.setOptional(true);
- minsuppP.addConstraint(new GreaterEqualConstraint(0));
+ minsuppP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(minsuppP)) {
minsupp = minsuppP.getValue();
}
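
Note: the recurring change in this commit replaces per-use constraint objects such as GreaterEqualConstraint and LessEqualConstraint with the shared, preallocated instances in CommonConstraints. A minimal sketch of the resulting pattern, assuming the surrounding Parameterizer and its MINSUPP_ID/minsupp members from the hunk above:

    import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
    import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;

    IntParameter minsuppP = new IntParameter(MINSUPP_ID);
    minsuppP.setOptional(true);
    // Before: minsuppP.addConstraint(new GreaterEqualConstraint(0));
    // After: one shared constraint instance instead of a fresh allocation.
    minsuppP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
    if(config.grab(minsuppP)) {
      minsupp = minsuppP.getValue();
    }

The same substitution (GreaterConstraint(0) becoming GREATER_EQUAL_ONE_INT, LessEqualConstraint(1) becoming LESS_EQUAL_ONE_DOUBLE, and so on) repeats in most files below.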
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java
index 68ac9595..65b86633 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractAlgorithm.java
@@ -85,72 +85,48 @@ public abstract class AbstractAlgorithm<R extends Result> implements Algorithm {
}
// Find appropriate run method.
- Method runmethod1 = null;
- Method runmethod2 = null;
try {
- runmethod1 = this.getClass().getMethod("run", signature1);
- runmethod2 = null;
- }
- catch(SecurityException e) {
- throw new APIViolationException("Security exception finding an appropriate 'run' method.", e);
+ Method runmethod1 = this.getClass().getMethod("run", signature1);
+ return (R) runmethod1.invoke(this, relations1);
}
catch(NoSuchMethodException e) {
- runmethod1 = null;
- // Try without "database" parameter.
- try {
- runmethod2 = this.getClass().getMethod("run", signature2);
- }
- catch(NoSuchMethodException e2) {
- runmethod2 = null;
+ // continue below.
+ }
+ catch(IllegalArgumentException | IllegalAccessException | SecurityException e) {
+ throw new APIViolationException("Invoking the real 'run' method failed.", e);
+ }
+ catch(InvocationTargetException e) {
+ final Throwable cause = e.getTargetException();
+ if(cause instanceof RuntimeException) {
+ throw (RuntimeException) cause;
}
- catch(SecurityException e2) {
- throw new APIViolationException("Security exception finding an appropriate 'run' method.", e2);
+ if(cause instanceof Error) {
+ throw (Error) cause;
}
+ throw new APIViolationException("Invoking the real 'run' method failed: " + cause.toString(), cause);
}
- if(runmethod1 != null) {
- try {
- return (R) runmethod1.invoke(this, relations1);
- }
- catch(IllegalArgumentException e) {
- throw new APIViolationException("Invoking the real 'run' method failed.", e);
- }
- catch(IllegalAccessException e) {
- throw new APIViolationException("Invoking the real 'run' method failed.", e);
- }
- catch(InvocationTargetException e) {
- if(e.getTargetException() instanceof RuntimeException) {
- throw (RuntimeException) e.getTargetException();
- }
- if(e.getTargetException() instanceof AssertionError) {
- throw (AssertionError) e.getTargetException();
- }
- throw new APIViolationException("Invoking the real 'run' method failed: " + e.getTargetException().toString(), e.getTargetException());
- }
+ try {
+ Method runmethod2 = this.getClass().getMethod("run", signature2);
+ return (R) runmethod2.invoke(this, relations2);
}
- else if(runmethod2 != null) {
- try {
- return (R) runmethod2.invoke(this, relations2);
- }
- catch(IllegalArgumentException e) {
- throw new APIViolationException("Invoking the real 'run' method failed.", e);
- }
- catch(IllegalAccessException e) {
- throw new APIViolationException("Invoking the real 'run' method failed.", e);
+ catch(NoSuchMethodException e) {
+ // continue below.
+ }
+ catch(IllegalArgumentException | IllegalAccessException | SecurityException e) {
+ throw new APIViolationException("Invoking the real 'run' method failed.", e);
+ }
+ catch(InvocationTargetException e) {
+ final Throwable cause = e.getTargetException();
+ if(cause instanceof RuntimeException) {
+ throw (RuntimeException) cause;
}
- catch(InvocationTargetException e) {
- if(e.getTargetException() instanceof RuntimeException) {
- throw (RuntimeException) e.getTargetException();
- }
- if(e.getTargetException() instanceof AssertionError) {
- throw (AssertionError) e.getTargetException();
- }
- throw new APIViolationException("Invoking the real 'run' method failed: " + e.getTargetException().toString(), e.getTargetException());
+ if(cause instanceof Error) {
+ throw (Error) cause;
}
+ throw new APIViolationException("Invoking the real 'run' method failed: " + cause.toString(), cause);
}
- else {
- throw new APIViolationException("No appropriate 'run' method found.");
- }
+ throw new APIViolationException("No appropriate 'run' method found.");
}
/**
@@ -177,6 +153,6 @@ public abstract class AbstractAlgorithm<R extends Result> implements Algorithm {
* @return Parameter object
*/
public static <F extends DistanceFunction<?, ?>> ObjectParameter<F> makeParameterDistanceFunction(Class<?> defaultDistanceFunction, Class<?> restriction) {
- return new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, restriction, defaultDistanceFunction);
+ return new ObjectParameter<>(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, restriction, defaultDistanceFunction);
}
}
\ No newline at end of file
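
Note: the rewrite above flattens the two-phase method lookup into sequential try blocks, merges the reflective failure cases into one Java 7 multi-catch, and broadens the rethrow from AssertionError to any Error. A standalone sketch of the resulting control flow (plain JDK reflection with a hypothetical exception type, not the ELKI class itself):

    import java.lang.reflect.InvocationTargetException;
    import java.lang.reflect.Method;

    static Object tryRun(Object algo, Class<?>[] signature, Object[] args) {
      try {
        Method run = algo.getClass().getMethod("run", signature);
        return run.invoke(algo, args);
      }
      catch(NoSuchMethodException e) {
        return null; // caller falls through to the next candidate signature
      }
      catch(IllegalArgumentException | IllegalAccessException | SecurityException e) {
        throw new IllegalStateException("Invoking the real 'run' method failed.", e);
      }
      catch(InvocationTargetException e) {
        final Throwable cause = e.getTargetException();
        if(cause instanceof RuntimeException) {
          throw (RuntimeException) cause; // rethrow unchecked causes unwrapped
        }
        if(cause instanceof Error) {
          throw (Error) cause; // now covers all Errors, not only AssertionError
        }
        throw new IllegalStateException("Invoking the real 'run' method failed: " + cause, cause);
      }
    }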
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java
index 40fe67c3..5d4b24c1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/AbstractPrimitiveDistanceBasedAlgorithm.java
@@ -48,7 +48,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
public abstract class AbstractPrimitiveDistanceBasedAlgorithm<O, D extends Distance<?>, R extends Result> extends AbstractAlgorithm<R> {
/**
* Holds the instance of the distance function specified by
- * {@link AbstractDistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}.
+ * {@link DistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}.
*/
protected PrimitiveDistanceFunction<? super O, D> distanceFunction;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java
index cc40d13b..dca3649e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/DependencyDerivator.java
@@ -51,8 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -68,7 +67,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
* E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, A. Zimek: Deriving
* Quantitative Dependencies for Correlation Clusters. <br>
* In Proc. 12th Int. Conf. on Knowledge Discovery and Data Mining (KDD '06),
- * Philadelphia, PA 2006. </p>
+ * Philadelphia, PA 2006.
+ * </p>
*
* @author Arthur Zimek
* @param <V> the type of FeatureVector handled by this Algorithm
@@ -303,20 +303,20 @@ public class DependencyDerivator<V extends NumberVector<?>, D extends Distance<D
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
-
+
IntParameter outputAccuracyP = new IntParameter(OUTPUT_ACCURACY_ID, 4);
- outputAccuracyP.addConstraint(new GreaterEqualConstraint(0));
+ outputAccuracyP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(outputAccuracyP)) {
outputAccuracy = outputAccuracyP.getValue();
}
-
+
IntParameter sampleSizeP = new IntParameter(SAMPLE_SIZE_ID);
sampleSizeP.setOptional(true);
- sampleSizeP.addConstraint(new GreaterConstraint(0));
+ sampleSizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(sampleSizeP)) {
sampleSize = sampleSizeP.getValue();
}
-
+
Flag randomSampleF = new Flag(DEPENDENCY_DERIVATOR_RANDOM_SAMPLE);
if(config.grab(randomSampleF)) {
randomSample = randomSampleF.getValue();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java
index b696ed36..46cf2246 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNDistanceOrder.java
@@ -43,8 +43,7 @@ import de.lmu.ifi.dbs.elki.result.KNNDistanceOrderResult;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -164,14 +163,14 @@ public class KNNDistanceOrder<O, D extends Distance<D>> extends AbstractDistance
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID, 1);
- kP.addConstraint(new GreaterConstraint(0));
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
k = kP.getValue();
}
DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, 1.0);
- percentageP.addConstraint(new GreaterConstraint(0));
- percentageP.addConstraint(new LessEqualConstraint(1));
+ percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
if(config.grab(percentageP)) {
percentage = percentageP.getValue();
}
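
Note: the migrated constraints preserve semantics. For the integer parameter k, new GreaterConstraint(0) and CommonConstraints.GREATER_EQUAL_ONE_INT accept exactly the same values (k >= 1); for the double-valued percentage, the strict variant keeps the open lower bound:

    DoubleParameter percentageP = new DoubleParameter(PERCENTAGE_ID, 1.0);
    percentageP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); // 0 < percentage
    percentageP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);    // percentage <= 1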
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java
index dddd8fdb..0f5078fb 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/KNNJoin.java
@@ -61,7 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -121,11 +121,11 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
*/
@SuppressWarnings("unchecked")
public WritableDataStore<KNNList<D>> run(Database database, Relation<V> relation) {
- if (!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) {
+ if(!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) {
throw new IllegalStateException("Distance Function must be an instance of " + SpatialPrimitiveDistanceFunction.class.getName());
}
Collection<SpatialIndexTree<N, E>> indexes = ResultUtil.filterResults(database, SpatialIndexTree.class);
- if (indexes.size() != 1) {
+ if(indexes.size() != 1) {
throw new AbortException("KNNJoin found " + indexes.size() + " spatial indexes, expected exactly one.");
}
// FIXME: Ensure were looking at the right relation!
@@ -140,7 +140,7 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
ComparableMinHeap<Task> pq = new ComparableMinHeap<>(ps_candidates.size() * ps_candidates.size() / 10);
// Initialize with the page self-pairing
- for (int i = 0; i < ps_candidates.size(); i++) {
+ for(int i = 0; i < ps_candidates.size(); i++) {
E pr_entry = ps_candidates.get(i);
N pr = index.getNode(pr_entry);
heaps.add(initHeaps(distFunction, pr));
@@ -148,41 +148,42 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
// Build priority queue
final int sqsize = ps_candidates.size() * (ps_candidates.size() - 1) >> 1;
- if (LOG.isDebuggingFine()) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Number of leaves: " + ps_candidates.size() + " so " + sqsize + " MBR computations.");
}
FiniteProgress mprogress = LOG.isVerbose() ? new FiniteProgress("Comparing leaf MBRs", sqsize, LOG) : null;
- for (int i = 0; i < ps_candidates.size(); i++) {
+ for(int i = 0; i < ps_candidates.size(); i++) {
E pr_entry = ps_candidates.get(i);
List<KNNHeap<D>> pr_heaps = heaps.get(i);
D pr_knn_distance = computeStopDistance(pr_heaps);
- for (int j = i + 1; j < ps_candidates.size(); j++) {
+ for(int j = i + 1; j < ps_candidates.size(); j++) {
E ps_entry = ps_candidates.get(j);
List<KNNHeap<D>> ps_heaps = heaps.get(j);
D ps_knn_distance = computeStopDistance(ps_heaps);
D minDist = distFunction.minDist(pr_entry, ps_entry);
// Resolve immediately:
- if (minDist.isNullDistance()) {
+ if(minDist.isNullDistance()) {
N pr = index.getNode(ps_candidates.get(i));
N ps = index.getNode(ps_candidates.get(j));
processDataPagesOptimize(distFunction, pr_heaps, ps_heaps, pr, ps);
- } else if (minDist.compareTo(pr_knn_distance) <= 0 || minDist.compareTo(ps_knn_distance) <= 0) {
+ }
+ else if(minDist.compareTo(pr_knn_distance) <= 0 || minDist.compareTo(ps_knn_distance) <= 0) {
pq.add(new Task(minDist, i, j));
}
- if (mprogress != null) {
+ if(mprogress != null) {
mprogress.incrementProcessed(LOG);
}
}
}
- if (mprogress != null) {
+ if(mprogress != null) {
mprogress.ensureCompleted(LOG);
}
// Process the queue
FiniteProgress qprogress = LOG.isVerbose() ? new FiniteProgress("Processing queue", pq.size(), LOG) : null;
IndefiniteProgress fprogress = LOG.isVerbose() ? new IndefiniteProgress("Full comparisons", LOG) : null;
- while (!pq.isEmpty()) {
+ while(!pq.isEmpty()) {
Task task = pq.poll();
List<KNNHeap<D>> pr_heaps = heaps.get(task.i);
List<KNNHeap<D>> ps_heaps = heaps.get(task.j);
@@ -190,30 +191,32 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
D ps_knn_distance = computeStopDistance(ps_heaps);
boolean dor = task.mindist.compareTo(pr_knn_distance) <= 0;
boolean dos = task.mindist.compareTo(ps_knn_distance) <= 0;
- if (dor || dos) {
+ if(dor || dos) {
N pr = index.getNode(ps_candidates.get(task.i));
N ps = index.getNode(ps_candidates.get(task.j));
- if (dor && dos) {
+ if(dor && dos) {
processDataPagesOptimize(distFunction, pr_heaps, ps_heaps, pr, ps);
- } else {
- if (dor) {
+ }
+ else {
+ if(dor) {
processDataPagesOptimize(distFunction, pr_heaps, null, pr, ps);
- } else /* dos */{
+ }
+ else /* dos */{
processDataPagesOptimize(distFunction, ps_heaps, null, ps, pr);
}
}
- if (fprogress != null) {
+ if(fprogress != null) {
fprogress.incrementProcessed(LOG);
}
}
- if (qprogress != null) {
+ if(qprogress != null) {
qprogress.incrementProcessed(LOG);
}
}
- if (qprogress != null) {
+ if(qprogress != null) {
qprogress.ensureCompleted(LOG);
}
- if (fprogress != null) {
+ if(fprogress != null) {
fprogress.setCompleted(LOG);
}
@@ -223,12 +226,12 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
// null;
FiniteProgress pageprog = LOG.isVerbose() ? new FiniteProgress("Number of processed data pages", ps_candidates.size(), LOG) : null;
// int processed = 0;
- for (int i = 0; i < ps_candidates.size(); i++) {
+ for(int i = 0; i < ps_candidates.size(); i++) {
N pr = index.getNode(ps_candidates.get(i));
List<KNNHeap<D>> pr_heaps = heaps.get(i);
// Finalize lists
- for (int j = 0; j < pr.getNumEntries(); j++) {
+ for(int j = 0; j < pr.getNumEntries(); j++) {
knnLists.put(((LeafEntry) pr.getEntry(j)).getDBID(), pr_heaps.get(j).toKNNList());
}
// Forget heaps and pq
@@ -238,14 +241,14 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
// if(progress != null) {
// progress.setProcessed(processed, logger);
// }
- if (pageprog != null) {
+ if(pageprog != null) {
pageprog.incrementProcessed(LOG);
}
}
// if(progress != null) {
// progress.ensureCompleted(logger);
// }
- if (pageprog != null) {
+ if(pageprog != null) {
pageprog.ensureCompleted(LOG);
}
return knnLists;
@@ -261,7 +264,7 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
private List<KNNHeap<D>> initHeaps(SpatialPrimitiveDistanceFunction<V, D> distFunction, N pr) {
List<KNNHeap<D>> pr_heaps = new ArrayList<>(pr.getNumEntries());
// Create for each data object a knn heap
- for (int j = 0; j < pr.getNumEntries(); j++) {
+ for(int j = 0; j < pr.getNumEntries(); j++) {
pr_heaps.add(DBIDUtil.newHeap(distFunction.getDistanceFactory(), k));
}
// Self-join first, as this is expected to improve most and cannot be
@@ -282,20 +285,21 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
*/
@SuppressWarnings("unchecked")
private void processDataPagesOptimize(SpatialPrimitiveDistanceFunction<V, D> distFunction, List<? extends KNNHeap<D>> pr_heaps, List<? extends KNNHeap<D>> ps_heaps, N pr, N ps) {
- if (DistanceUtil.isDoubleDistanceFunction(distFunction)) {
+ if(DistanceUtil.isDoubleDistanceFunction(distFunction)) {
List<?> khp = (List<?>) pr_heaps;
List<?> khs = (List<?>) ps_heaps;
processDataPagesDouble((SpatialPrimitiveDoubleDistanceFunction<? super V>) distFunction, pr, ps, (List<DoubleDistanceKNNHeap>) khp, (List<DoubleDistanceKNNHeap>) khs);
- } else {
- for (int j = 0; j < ps.getNumEntries(); j++) {
+ }
+ else {
+ for(int j = 0; j < ps.getNumEntries(); j++) {
final SpatialPointLeafEntry s_e = (SpatialPointLeafEntry) ps.getEntry(j);
DBID s_id = s_e.getDBID();
- for (int i = 0; i < pr.getNumEntries(); i++) {
+ for(int i = 0; i < pr.getNumEntries(); i++) {
final SpatialPointLeafEntry r_e = (SpatialPointLeafEntry) pr.getEntry(i);
D distance = distFunction.minDist(s_e, r_e);
- pr_heaps.get(i).add(distance, s_id);
- if (pr != ps && ps_heaps != null) {
- ps_heaps.get(j).add(distance, r_e.getDBID());
+ pr_heaps.get(i).insert(distance, s_id);
+ if(pr != ps && ps_heaps != null) {
+ ps_heaps.get(j).insert(distance, r_e.getDBID());
}
}
}
@@ -314,15 +318,15 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
*/
private void processDataPagesDouble(SpatialPrimitiveDoubleDistanceFunction<? super V> df, N pr, N ps, List<DoubleDistanceKNNHeap> pr_heaps, List<DoubleDistanceKNNHeap> ps_heaps) {
// Compare pairwise
- for (int j = 0; j < ps.getNumEntries(); j++) {
+ for(int j = 0; j < ps.getNumEntries(); j++) {
final SpatialPointLeafEntry s_e = (SpatialPointLeafEntry) ps.getEntry(j);
DBID s_id = s_e.getDBID();
- for (int i = 0; i < pr.getNumEntries(); i++) {
+ for(int i = 0; i < pr.getNumEntries(); i++) {
final SpatialPointLeafEntry r_e = (SpatialPointLeafEntry) pr.getEntry(i);
double distance = df.doubleMinDist(s_e, r_e);
- pr_heaps.get(i).add(distance, s_id);
- if (pr != ps && ps_heaps != null) {
- ps_heaps.get(j).add(distance, r_e.getDBID());
+ pr_heaps.get(i).insert(distance, s_id);
+ if(pr != ps && ps_heaps != null) {
+ ps_heaps.get(j).insert(distance, r_e.getDBID());
}
}
}
@@ -337,15 +341,16 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
private D computeStopDistance(List<KNNHeap<D>> heaps) {
// Update pruning distance
D pr_knn_distance = null;
- for (KNNHeap<D> knnList : heaps) {
+ for(KNNHeap<D> knnList : heaps) {
// set kNN distance of r
- if (pr_knn_distance == null) {
+ if(pr_knn_distance == null) {
pr_knn_distance = knnList.getKNNDistance();
- } else {
+ }
+ else {
pr_knn_distance = DistanceUtil.max(knnList.getKNNDistance(), pr_knn_distance);
}
}
- if (pr_knn_distance == null) {
+ if(pr_knn_distance == null) {
return getDistanceFunction().getDistanceFactory().infiniteDistance();
}
return pr_knn_distance;
@@ -421,8 +426,8 @@ public class KNNJoin<V extends NumberVector<?>, D extends Distance<D>, N extends
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID, 1);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
}
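
Note: besides the brace-style cleanup, the functional change in KNNJoin is the rename of KNNHeap.add(distance, id) to KNNHeap.insert(distance, id). The page-pair pruning that decides whether a pair is compared at all is unchanged; a toy restatement with plain doubles (not ELKI distance types):

    // A pair of leaf pages (pr, ps) needs a full comparison only if the
    // minimum distance between their MBRs can still improve some kNN
    // candidate in at least one of the two pages.
    static boolean needsComparison(double minDist, double prKnnDist, double psKnnDist) {
      return minDist <= prKnnDist || minDist <= psKnnDist;
    }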
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java b/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java
index 3d0ea52a..8b83b5d4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/benchmark/ValidateApproximativeKNNIndex.java
@@ -36,9 +36,9 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.LinearScanQuery;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
-import de.lmu.ifi.dbs.elki.database.query.knn.LinearScanKNNQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.DatabaseConnection;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
@@ -141,32 +141,35 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
// Approximate query:
KNNQuery<O, D> knnQuery = database.getKNNQuery(distQuery, k, DatabaseQuery.HINT_OPTIMIZED_ONLY);
- if (knnQuery == null || knnQuery instanceof LinearScanKNNQuery) {
+ if(knnQuery == null || knnQuery instanceof LinearScanQuery) {
throw new AbortException("Expected an accelerated query, but got a linear scan -- index is not used.");
}
// Exact query:
KNNQuery<O, D> truekNNQuery;
- if (forcelinear) {
+ if(forcelinear) {
truekNNQuery = QueryUtil.getLinearScanKNNQuery(distQuery);
- } else {
+ }
+ else {
truekNNQuery = database.getKNNQuery(distQuery, k, DatabaseQuery.HINT_EXACT);
}
- if (knnQuery.getClass().equals(truekNNQuery.getClass())) {
+ if(knnQuery.getClass().equals(truekNNQuery.getClass())) {
LOG.warning("Query classes are the same. This experiment may be invalid!");
}
// No query set - use original database.
- if (queries == null || pattern != null) {
+ if(queries == null || pattern != null) {
// Relation to filter on
Relation<String> lrel = (pattern != null) ? DatabaseUtil.guessLabelRepresentation(database) : null;
final DBIDs sample;
- if (sampling <= 0) {
+ if(sampling <= 0) {
sample = relation.getDBIDs();
- } else if (sampling < 1.1) {
+ }
+ else if(sampling < 1.1) {
int size = (int) Math.min(sampling * relation.size(), relation.size());
sample = DBIDUtil.randomSample(relation.getDBIDs(), size, random);
- } else {
+ }
+ else {
int size = (int) Math.min(sampling, relation.size());
sample = DBIDUtil.randomSample(relation.getDBIDs(), size, random);
}
@@ -174,8 +177,8 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
MeanVariance mv = new MeanVariance(), mvrec = new MeanVariance();
MeanVariance mvdist = new MeanVariance(), mvdaerr = new MeanVariance(), mvdrerr = new MeanVariance();
int misses = 0;
- for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
- if (pattern == null || pattern.matcher(lrel.get(iditer)).find()) {
+ for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
+ if(pattern == null || pattern.matcher(lrel.get(iditer)).find()) {
// Query index:
KNNList<D> knns = knnQuery.getKNNForDBID(iditer, k);
// Query reference:
@@ -187,53 +190,55 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
// Put recall:
mvrec.put(DBIDUtil.intersectionSize(knns, trueknns) / trueknns.size());
- if (knns.size() >= k) {
+ if(knns.size() >= k) {
D kdist = knns.getKNNDistance();
- if (kdist instanceof NumberDistance) {
+ if(kdist instanceof NumberDistance) {
final double dist = ((NumberDistance<?, ?>) kdist).doubleValue();
final double tdist = ((NumberDistance<?, ?>) trueknns.getKNNDistance()).doubleValue();
- if (tdist > 0.0) {
+ if(tdist > 0.0) {
mvdist.put(dist);
mvdaerr.put(dist - tdist);
mvdrerr.put(dist / tdist);
}
}
- } else {
+ }
+ else {
// Less than k objects.
misses++;
}
}
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
- if (LOG.isStatistics()) {
+ if(LOG.isStatistics()) {
LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
LOG.statistics("Recall of true results: " + mvrec.getMean() + " +- " + mvrec.getNaiveStddev());
- if (mvdist.getCount() > 0) {
+ if(mvdist.getCount() > 0) {
LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev());
LOG.statistics("Mean absolute k-error: " + mvdaerr.getMean() + " +- " + mvdaerr.getNaiveStddev());
LOG.statistics("Mean relative k-error: " + mvdrerr.getMean() + " +- " + mvdrerr.getNaiveStddev());
}
- if (misses > 0) {
+ if(misses > 0) {
LOG.statistics(String.format("Number of queries that returned less than k=%d objects: %d (%.2f%%)", k, misses, misses * 100. / mv.getCount()));
}
}
- } else {
+ }
+ else {
// Separate query set.
TypeInformation res = getDistanceFunction().getInputTypeRestriction();
MultipleObjectsBundle bundle = queries.loadData();
int col = -1;
- for (int i = 0; i < bundle.metaLength(); i++) {
- if (res.isAssignableFromType(bundle.meta(i))) {
+ for(int i = 0; i < bundle.metaLength(); i++) {
+ if(res.isAssignableFromType(bundle.meta(i))) {
col = i;
break;
}
}
- if (col < 0) {
+ if(col < 0) {
throw new AbortException("No compatible data type in query input was found. Expected: " + res.toString());
}
// Random sampling is a bit of hack, sorry.
@@ -241,12 +246,14 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
DBIDRange sids = DBIDUtil.generateStaticDBIDRange(bundle.dataLength());
final DBIDs sample;
- if (sampling <= 0) {
+ if(sampling <= 0) {
sample = sids;
- } else if (sampling < 1.1) {
+ }
+ else if(sampling < 1.1) {
int size = (int) Math.min(sampling * relation.size(), relation.size());
sample = DBIDUtil.randomSample(sids, size, random);
- } else {
+ }
+ else {
int size = (int) Math.min(sampling, sids.size());
sample = DBIDUtil.randomSample(sids, size, random);
}
@@ -254,7 +261,7 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
MeanVariance mv = new MeanVariance(), mvrec = new MeanVariance();
MeanVariance mvdist = new MeanVariance(), mvdaerr = new MeanVariance(), mvdrerr = new MeanVariance();
int misses = 0;
- for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
int off = sids.binarySearch(iditer);
assert (off >= 0);
@SuppressWarnings("unchecked")
@@ -271,36 +278,37 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
// Put recall:
mvrec.put(DBIDUtil.intersectionSize(knns, trueknns) / trueknns.size());
- if (knns.size() >= k) {
+ if(knns.size() >= k) {
D kdist = knns.getKNNDistance();
- if (kdist instanceof NumberDistance) {
+ if(kdist instanceof NumberDistance) {
final double dist = ((NumberDistance<?, ?>) kdist).doubleValue();
final double tdist = ((NumberDistance<?, ?>) trueknns.getKNNDistance()).doubleValue();
- if (tdist > 0.0) {
+ if(tdist > 0.0) {
mvdist.put(dist);
mvdaerr.put(dist - tdist);
mvdrerr.put(dist / tdist);
}
}
- } else {
+ }
+ else {
// Less than k objects.
misses++;
}
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
- if (LOG.isStatistics()) {
+ if(LOG.isStatistics()) {
LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
LOG.statistics("Recall of true results: " + mvrec.getMean() + " +- " + mvrec.getNaiveStddev());
- if (mvdist.getCount() > 0) {
+ if(mvdist.getCount() > 0) {
LOG.statistics("Mean absolute k-error: " + mvdaerr.getMean() + " +- " + mvdaerr.getNaiveStddev());
LOG.statistics("Mean relative k-error: " + mvdrerr.getMean() + " +- " + mvdrerr.getNaiveStddev());
}
- if (misses > 0) {
+ if(misses > 0) {
LOG.statistics(String.format("Number of queries that returned less than k=%d objects: %d (%.2f%%)", k, misses, misses * 100. / mv.getCount()));
}
}
@@ -393,31 +401,32 @@ public class ValidateApproximativeKNNIndex<O, D extends Distance<D>> extends Abs
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID);
- if (config.grab(kP)) {
+ if(config.grab(kP)) {
k = kP.intValue();
}
PatternParameter patternP = new PatternParameter(PATTERN_ID);
patternP.setOptional(true);
- if (config.grab(patternP)) {
+ if(config.grab(patternP)) {
pattern = patternP.getValue();
- } else {
+ }
+ else {
ObjectParameter<DatabaseConnection> queryP = new ObjectParameter<>(QUERY_ID, DatabaseConnection.class);
queryP.setOptional(true);
- if (config.grab(queryP)) {
+ if(config.grab(queryP)) {
queries = queryP.instantiateClass(config);
}
}
DoubleParameter samplingP = new DoubleParameter(SAMPLING_ID);
samplingP.setOptional(true);
- if (config.grab(samplingP)) {
+ if(config.grab(samplingP)) {
sampling = samplingP.doubleValue();
}
Flag forceP = new Flag(FORCE_ID);
- if (config.grab(forceP)) {
+ if(config.grab(forceP)) {
forcelinear = forceP.isTrue();
}
RandomParameter randomP = new RandomParameter(RANDOM_ID, RandomFactory.DEFAULT);
- if (config.grab(randomP)) {
+ if(config.grab(randomP)) {
random = randomP.getValue();
}
}
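
Note: the benchmark above contrasts an index-accelerated kNN query with a guaranteed-exact one and aggregates per-query recall in a MeanVariance. A toy restatement of the recall statistic, using plain integer IDs instead of ELKI DBIDs:

    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    static double recall(List<Integer> approximate, List<Integer> exact) {
      Set<Integer> truth = new HashSet<>(exact);
      int hits = 0;
      for(Integer id : approximate) {
        if(truth.contains(id)) {
          hits++;
        }
      }
      return hits / (double) exact.size(); // fraction of true neighbors found
    }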
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
index 0c4eb5fc..96c95a9f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedClustering.java
@@ -35,7 +35,7 @@ import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistance
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -152,8 +152,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext
*/
protected void configK(Parameterization config) {
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
}
@@ -165,8 +165,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext
*/
protected void configKI(Parameterization config) {
IntParameter k_iP = new IntParameter(K_I_ID, 30);
- k_iP.addConstraint(new GreaterConstraint(0));
- if (config.grab(k_iP)) {
+ k_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(k_iP)) {
k_i = k_iP.getValue();
}
}
@@ -178,8 +178,8 @@ public abstract class AbstractProjectedClustering<R extends Clustering<?>, V ext
*/
protected void configL(Parameterization config) {
IntParameter lP = new IntParameter(L_ID);
- lP.addConstraint(new GreaterConstraint(0));
- if (config.grab(lP)) {
+ lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(lP)) {
l = lP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
index ee3b234c..52e37197 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/AbstractProjectedDBSCAN.java
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -294,7 +294,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
// try to expand the cluster
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
ModifiableDBIDs seeds = DBIDUtil.newHashSet();
- for (DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) {
+ for(DistanceDBIDListIter<DoubleDistance> seed = neighbors.iter(); seed.valid(); seed.advance()) {
int nextID_corrDim = distFunc.getIndex().getLocalProjection(seed).getCorrelationDimension();
// nextID is not reachable from start object
if(nextID_corrDim > lambda) {
@@ -322,9 +322,9 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
DistanceDBIDList<DoubleDistance> reachables = rangeQuery.getRangeForDBID(iter, epsilon);
iter.remove();
-
+
if(reachables.size() > minpts) {
- for (DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) {
+ for(DistanceDBIDListIter<DoubleDistance> r = reachables.iter(); r.valid(); r.advance()) {
int corrDim_r = distFunc.getIndex().getLocalProjection(r).getCorrelationDimension();
// r is not reachable from q
if(corrDim_r > lambda) {
@@ -351,9 +351,10 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
}
}
- /* if(processedIDs.size() == relation.size() && noise.size() == 0) {
- break;
- } */
+ /*
+ * if(processedIDs.size() == relation.size() && noise.size() == 0) {
+ * break; }
+ */
}
if(currentCluster.size() >= minpts) {
@@ -375,7 +376,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(distanceFunction.getInputTypeRestriction());
}
-
+
/**
* Parameterization class.
*
@@ -411,7 +412,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
protected void configMinPts(Parameterization config) {
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
@@ -435,7 +436,7 @@ public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V ext
protected void configLambda(Parameterization config) {
IntParameter lambdaP = new IntParameter(LAMBDA_ID);
- lambdaP.addConstraint(new GreaterConstraint(0));
+ lambdaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(lambdaP)) {
lambda = lambdaP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
index 57dcb435..09c78fec 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DBSCAN.java
@@ -38,9 +38,8 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
@@ -52,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -82,24 +81,12 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
private static final Logging LOG = Logging.getLogger(DBSCAN.class);
/**
- * Parameter to specify the maximum radius of the neighborhood to be
- * considered, must be suitable to the distance function specified.
+ * Holds the epsilon radius threshold.
*/
- public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");
+ protected D epsilon;
/**
- * Holds the value of {@link #EPSILON_ID}.
- */
- private D epsilon;
-
- /**
- * Parameter to specify the threshold for minimum number of points in the
- * epsilon-neighborhood of a point, must be an integer greater than 0.
- */
- public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point.");
-
- /**
- * Holds the value of {@link #MINPTS_ID}.
+ * Holds the minimum cluster size.
*/
protected int minpts;
@@ -146,7 +133,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
if(size < minpts) {
// The can't be any clusters
noise.addDBIDs(relation.getDBIDs());
- objprog.setProcessed(noise.size(), LOG);
+ if(objprog != null) {
+ objprog.setProcessed(noise.size(), LOG);
+ }
}
else {
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
@@ -193,7 +182,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
* @param objprog the progress object for logging the current status
*/
protected void expandCluster(Relation<O> relation, RangeQuery<O, D> rangeQuery, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
- DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
+ DBIDs neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
// startObject is no core-object
if(neighbors.size() < minpts) {
@@ -207,7 +196,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
// try to expand the cluster
- HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
+ ModifiableDBIDs seeds = DBIDUtil.newHashSet();
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
for(DBIDIter seed = neighbors.iter(); seed.valid(); seed.advance()) {
if(!processedIDs.contains(seed)) {
@@ -222,9 +211,9 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
seeds.remove(startObjectID);
- while(seeds.size() > 0) {
+ while(!seeds.isEmpty()) {
DBIDMIter o = seeds.iter();
- DistanceDBIDList<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon);
+ DBIDs neighborhood = rangeQuery.getRangeForDBID(o, epsilon);
o.remove();
if(neighborhood.size() >= minpts) {
@@ -282,6 +271,18 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
* @apiviz.exclude
*/
public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ /**
+ * Parameter to specify the maximum radius of the neighborhood to be
+ * considered, must be suitable to the distance function specified.
+ */
+ public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");
+
+ /**
+ * Parameter to specify the threshold for minimum number of points in the
+ * epsilon-neighborhood of a point, must be an integer greater than 0.
+ */
+ public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point.");
+
protected D epsilon = null;
protected int minpts = 0;
@@ -295,7 +296,7 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
@@ -306,4 +307,4 @@ public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
return new DBSCAN<>(distanceFunction, epsilon, minpts);
}
}
-}
\ No newline at end of file
+}
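
Note: two things change in DBSCAN besides the constraint migration: the EPSILON_ID and MINPTS_ID option IDs move into the Parameterizer (external references become DBSCAN.Parameterizer.EPSILON_ID), and progress logging gains a null guard, since progress objects only exist in verbose mode. The guard pattern, following the construction idiom visible in the KNNJoin hunks:

    FiniteProgress objprog = LOG.isVerbose() ? new FiniteProgress("Processing objects", relation.size(), LOG) : null;
    // ... clustering work ...
    if(objprog != null) {
      objprog.setProcessed(noise.size(), LOG);
    }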
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
index 3c2e0278..814b4cc4 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
@@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -496,7 +496,7 @@ public class DeLiClu<NV extends NumberVector<?>, D extends Distance<D>> extends
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
index c66442a1..e82ec674 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/EM.java
@@ -34,6 +34,7 @@ import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.EMModel;
+import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
@@ -41,14 +42,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
@@ -57,8 +59,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -72,8 +73,8 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* zero-covariance and variance=1 in covariance matrices.
* </p>
* <p>
- * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin: Maximum Likelihood from
- * Incomplete Data via the EM algorithm. <br>
+ * Reference: A. P. Dempster, N. M. Laird, D. B. Rubin:<br />
+ * Maximum Likelihood from Incomplete Data via the EM algorithm.<br>
* In Journal of the Royal Statistical Society, Series B, 39(1), 1977, pp. 1-31
* </p>
*
@@ -100,48 +101,36 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
private static final double SINGULARITY_CHEAT = 1E-9;
/**
- * Parameter to specify the number of clusters to find, must be an integer
- * greater than 0.
- */
- public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find.");
-
- /**
- * Holds the value of {@link #K_ID}.
+ * Number of clusters
*/
private int k;
/**
- * Parameter to specify the termination criterion for maximization of E(M):
- * E(M) - E(M') < em.delta, must be a double equal to or greater than 0.
+ * Delta parameter
*/
- public static final OptionID DELTA_ID = new OptionID("em.delta", "The termination criterion for maximization of E(M): " + "E(M) - E(M') < em.delta");
+ private double delta;
/**
- * Parameter to specify the initialization method
+ * Class to choose the initial means
*/
- public static final OptionID INIT_ID = new OptionID("kmeans.initialization", "Method to choose the initial means.");
-
- private static final double MIN_LOGLIKELIHOOD = -100000;
+ private KMeansInitialization<V> initializer;
/**
- * Holds the value of {@link #DELTA_ID}.
+ * Maximum number of iterations to allow
*/
- private double delta;
+ private int maxiter;
/**
- * Store the individual probabilities, for use by EMOutlierDetection etc.
+ * Retain soft assignments.
*/
- private WritableDataStore<double[]> probClusterIGivenX;
+ private boolean soft;
- /**
- * Class to choose the initial means
- */
- private KMeansInitialization<V> initializer;
+ private static final double MIN_LOGLIKELIHOOD = -100000;
/**
- * Maximum number of iterations to allow
+ * Soft assignment result type.
*/
- private int maxiter;
+ public static final SimpleTypeInformation<double[]> SOFT_TYPE = new SimpleTypeInformation<>(double[].class);
/**
* Constructor.
@@ -150,13 +139,15 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @param delta delta parameter
* @param initializer Class to choose the initial means
* @param maxiter Maximum number of iterations
+ * @param soft Include soft assignments
*/
- public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter) {
+ public EM(int k, double delta, KMeansInitialization<V> initializer, int maxiter, boolean soft) {
super();
this.k = k;
this.delta = delta;
this.initializer = initializer;
this.maxiter = maxiter;
+ this.setSoft(soft);
}
/**
@@ -172,137 +163,80 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @return Result
*/
public Clustering<EMModel<V>> run(Database database, Relation<V> relation) {
- if (relation.size() == 0) {
+ if(relation.size() == 0) {
throw new IllegalArgumentException("database empty: must contain elements");
}
// initial models
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("initializing " + k + " models");
}
- List<Vector> means = new ArrayList<>();
- for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC)) {
- means.add(nv.getColumnVector());
+ final List<V> initialMeans = initializer.chooseInitialMeans(database, relation, k, EuclideanDistanceFunction.STATIC);
+ assert (initialMeans.size() == k);
+ Vector[] means = new Vector[k];
+ {
+ int i = 0;
+ for(NumberVector<?> nv : initialMeans) {
+ means[i] = nv.getColumnVector();
+ i++;
+ }
}
- List<Matrix> covarianceMatrices = new ArrayList<>(k);
+ Matrix[] covarianceMatrices = new Matrix[k];
double[] normDistrFactor = new double[k];
- List<Matrix> invCovMatr = new ArrayList<>(k);
+ Matrix[] invCovMatr = new Matrix[k];
double[] clusterWeights = new double[k];
- probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
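+    // Holds P(cluster_i | x) for every object; kept and returned as a relation when soft output is requested.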
- final int dimensionality = means.get(0).getDimensionality();
- for (int i = 0; i < k; i++) {
+ final int dimensionality = means[0].getDimensionality();
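+    // Precompute (2pi)^d, the dimension-dependent term of the Gaussian normalization constant.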
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dimensionality);
+ for(int i = 0; i < k; i++) {
Matrix m = Matrix.identity(dimensionality, dimensionality);
- covarianceMatrices.add(m);
- final double det = m.det();
- if (det > 0.) {
- normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det);
- } else {
- LOG.warning("Encountered matrix with 0 determinant - degenerated.");
- normDistrFactor[i] = 1.0; // Not really well defined
- }
- invCovMatr.add(m.inverse());
+ covarianceMatrices[i] = m;
+ normDistrFactor[i] = 1.0 / Math.sqrt(norm);
+ invCovMatr[i] = Matrix.identity(dimensionality, dimensionality);
clusterWeights[i] = 1.0 / k;
- if (LOG.isDebuggingFinest()) {
- StringBuilder msg = new StringBuilder();
- msg.append(" model ").append(i).append(":\n");
- msg.append(" mean: ").append(means.get(i)).append('\n');
- msg.append(" m:\n").append(FormatUtil.format(m, " ")).append('\n');
- msg.append(" m.det(): ").append(det).append('\n');
- msg.append(" cluster weight: ").append(clusterWeights[i]).append('\n');
- msg.append(" normDistFact: ").append(normDistrFactor[i]).append('\n');
- LOG.debugFine(msg.toString());
- }
}
double emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
// iteration unless no change
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("iterating EM");
}
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("iteration " + 0 + " - expectation value: " + emNew);
}
- double em;
- for (int it = 1; it <= maxiter || maxiter < 0; it++) {
- em = emNew;
-
- // recompute models
- List<Vector> meanSums = new ArrayList<>(k);
- double[] sumOfClusterProbabilities = new double[k];
-
- for (int i = 0; i < k; i++) {
- clusterWeights[i] = 0.0;
- meanSums.add(new Vector(dimensionality));
- covarianceMatrices.set(i, Matrix.zeroMatrix(dimensionality));
- }
-
- // weights and means
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- double[] clusterProbabilities = probClusterIGivenX.get(iditer);
-
- for (int i = 0; i < k; i++) {
- sumOfClusterProbabilities[i] += clusterProbabilities[i];
- Vector summand = relation.get(iditer).getColumnVector().timesEquals(clusterProbabilities[i]);
- meanSums.get(i).plusEquals(summand);
- }
- }
- final int n = relation.size();
- for (int i = 0; i < k; i++) {
- clusterWeights[i] = sumOfClusterProbabilities[i] / n;
- Vector newMean = meanSums.get(i).timesEquals(1 / sumOfClusterProbabilities[i]);
- means.set(i, newMean);
- }
- // covariance matrices
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- double[] clusterProbabilities = probClusterIGivenX.get(iditer);
- Vector instance = relation.get(iditer).getColumnVector();
- for (int i = 0; i < k; i++) {
- Vector difference = instance.minus(means.get(i));
- covarianceMatrices.get(i).plusEquals(difference.timesTranspose(difference).timesEquals(clusterProbabilities[i]));
- }
- }
- for (int i = 0; i < k; i++) {
- covarianceMatrices.set(i, covarianceMatrices.get(i).times(1 / sumOfClusterProbabilities[i]).cheatToAvoidSingularity(SINGULARITY_CHEAT));
- }
- for (int i = 0; i < k; i++) {
- final double det = covarianceMatrices.get(i).det();
- if (det > 0.) {
- normDistrFactor[i] = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, dimensionality) * det);
- } else {
- LOG.warning("Encountered matrix with 0 determinant - degenerated.");
- normDistrFactor[i] = 1.0; // Not really well defined
- }
- invCovMatr.set(i, covarianceMatrices.get(i).inverse());
- }
+ for(int it = 1; it <= maxiter || maxiter < 0; it++) {
+ final double emOld = emNew;
+ recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dimensionality);
+ computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
// reassign probabilities
emNew = assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
}
- if (Math.abs(em - emNew) <= delta) {
+ if(Math.abs(emOld - emNew) <= delta) {
break;
}
}
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("assigning clusters");
}
// fill result with clusters and models
List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
hardClusters.add(DBIDUtil.newHashSet());
}
// provide a hard clustering
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double[] clusterProbabilities = probClusterIGivenX.get(iditer);
int maxIndex = 0;
double currentMax = 0.0;
- for (int i = 0; i < k; i++) {
- if (clusterProbabilities[i] > currentMax) {
+ for(int i = 0; i < k; i++) {
+ if(clusterProbabilities[i] > currentMax) {
maxIndex = i;
currentMax = clusterProbabilities[i];
}
@@ -312,24 +246,89 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Clustering<EMModel<V>> result = new Clustering<>("EM Clustering", "em-clustering");
// provide models within the result
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
// TODO: re-do labeling.
// SimpleClassLabel label = new SimpleClassLabel();
// label.init(result.canonicalClusterLabel(i));
- Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means.get(i).getArrayRef()), covarianceMatrices.get(i)));
+ Cluster<EMModel<V>> model = new Cluster<>(hardClusters.get(i), new EMModel<>(factory.newNumberVector(means[i].getArrayRef()), covarianceMatrices[i]));
result.addToplevelCluster(model);
}
+ if(isSoft()) {
+ result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
+ }
+ else {
+ probClusterIGivenX.destroy();
+ }
return result;
}
/**
+ * Compute the inverse cluster matrices.
+ *
+ * @param covarianceMatrices Input covariance matrices
+ * @param invCovMatr Output array for inverse matrices
+ * @param normDistrFactor Output array for norm distribution factors.
+ * @param norm Normalization factor, usually (2pi)^d
+ */
+ public static void computeInverseMatrixes(Matrix[] covarianceMatrices, Matrix[] invCovMatr, double[] normDistrFactor, final double norm) {
+ int k = covarianceMatrices.length;
+ for(int i = 0; i < k; i++) {
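+      // Gaussian normalization constant: 1 / sqrt((2pi)^d * det(Sigma_i)).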
+ final double det = covarianceMatrices[i].det();
+ if(det > 0.) {
+ normDistrFactor[i] = 1. / Math.sqrt(norm * det);
+ }
+ else {
+ LOG.warning("Encountered matrix with 0 determinant - degenerated.");
+ normDistrFactor[i] = 1.; // Not really well defined
+ }
+ invCovMatr[i] = covarianceMatrices[i].inverse();
+ }
+ }
+
+ /**
+   * Recompute the covariance matrices.
+ *
+ * @param relation Vector data
+ * @param probClusterIGivenX Object probabilities
+ * @param means Cluster means output
+   * @param covarianceMatrices Output covariance matrices
+ * @param dimensionality Data set dimensionality
+ */
+ public static void recomputeCovarianceMatrices(Relation<? extends NumberVector<?>> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] covarianceMatrices, final int dimensionality) {
+ final int k = means.length;
+ CovarianceMatrix[] cms = new CovarianceMatrix[k];
+ for(int i = 0; i < k; i++) {
+ cms[i] = new CovarianceMatrix(dimensionality);
+ }
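+    // E step aggregation: add every point to each cluster's covariance builder, weighted by its membership probability.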
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double[] clusterProbabilities = probClusterIGivenX.get(iditer);
+ Vector instance = relation.get(iditer).getColumnVector();
+ for(int i = 0; i < k; i++) {
+ if(clusterProbabilities[i] > 0.) {
+ cms[i].put(instance, clusterProbabilities[i]);
+ }
+ }
+ }
+ for(int i = 0; i < k; i++) {
+ if(cms[i].getWeight() <= 0.) {
+ means[i] = new Vector(dimensionality);
+ covarianceMatrices[i] = Matrix.identity(dimensionality, dimensionality);
+ }
+ else {
+ means[i] = cms[i].getMeanVector();
+ covarianceMatrices[i] = cms[i].destroyToNaiveMatrix().cheatToAvoidSingularity(SINGULARITY_CHEAT);
+ }
+ }
+ }
+
+ /**
* Assigns the current probability values to the instances in the database and
   * computes the expectation value of the current mixture of distributions.
*
   * Computed as the mean of the logarithms of the prior probability of each
* instance.
*
- * @param database the database used for assignment to instances
+   * @param relation the data relation to assign the probabilities to
* @param normDistrFactor normalization factor for density function, based on
* current covariance matrix
* @param means the current means
@@ -337,58 +336,55 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @param clusterWeights the weights of the current clusters
* @return the expectation value of the current mixture of distributions
*/
- protected double assignProbabilitiesToInstances(Relation<V> database, double[] normDistrFactor, List<Vector> means, List<Matrix> invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
- double emSum = 0.0;
+ public static double assignProbabilitiesToInstances(Relation<? extends NumberVector<?>> relation, double[] normDistrFactor, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, WritableDataStore<double[]> probClusterIGivenX) {
+ final int k = clusterWeights.length;
+ double emSum = 0.;
- for (DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
- Vector x = database.get(iditer).getColumnVector();
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ Vector x = relation.get(iditer).getColumnVector();
double[] probabilities = new double[k];
- for (int i = 0; i < k; i++) {
- Vector difference = x.minus(means.get(i));
- double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr.get(i), difference);
- double power = rowTimesCovTimesCol / 2.0;
+ for(int i = 0; i < k; i++) {
+ Vector difference = x.minus(means[i]);
+ double rowTimesCovTimesCol = difference.transposeTimesTimes(invCovMatr[i], difference);
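+        // rowTimesCovTimesCol is the squared Mahalanobis distance of x to cluster i; the density is factor * exp(-dist / 2).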
+ double power = rowTimesCovTimesCol / 2.;
double prob = normDistrFactor[i] * Math.exp(-power);
- if (LOG.isDebuggingFinest()) {
- LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + " difference:\n" + FormatUtil.format(difference, " ") + "\n" + " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + " power= " + power + "\n" + " prob=" + prob + "\n" + " inv cov matrix: \n" + FormatUtil.format(invCovMatr.get(i), " "));
+ if(LOG.isDebuggingFinest()) {
+ LOG.debugFinest(" difference vector= ( " + difference.toString() + " )\n" + //
+ " difference:\n" + FormatUtil.format(difference, " ") + "\n" + //
+ " rowTimesCovTimesCol:\n" + rowTimesCovTimesCol + "\n" + //
+ " power= " + power + "\n" + " prob=" + prob + "\n" + //
+ " inv cov matrix: \n" + FormatUtil.format(invCovMatr[i], " "));
}
- if (!(prob >= 0.)) {
+ if(!(prob >= 0.)) {
LOG.warning("Invalid probability: " + prob + " power: " + power + " factor: " + normDistrFactor[i]);
+ prob = 0.;
}
probabilities[i] = prob;
}
- double priorProbability = 0.0;
- for (int i = 0; i < k; i++) {
+ double priorProbability = 0.;
+ for(int i = 0; i < k; i++) {
priorProbability += probabilities[i] * clusterWeights[i];
}
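+      // Clamp at MIN_LOGLIKELIHOOD so objects with vanishing density cannot push the sum to -infinity.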
double logP = Math.max(Math.log(priorProbability), MIN_LOGLIKELIHOOD);
- if (!Double.isNaN(logP)) {
+ if(!Double.isNaN(logP)) {
emSum += logP;
}
double[] clusterProbabilities = new double[k];
- for (int i = 0; i < k; i++) {
- assert (clusterWeights[i] >= 0.0);
+ for(int i = 0; i < k; i++) {
+ assert (clusterWeights[i] >= 0.);
// do not divide by zero!
- if (priorProbability > 0.0) {
+ if(priorProbability > 0.) {
clusterProbabilities[i] = probabilities[i] / priorProbability * clusterWeights[i];
- } else {
- clusterProbabilities[i] = 0.0;
+ }
+ else {
+ clusterProbabilities[i] = 0.;
}
}
probClusterIGivenX.put(iditer, clusterProbabilities);
}
- return emSum;
- }
-
- /**
- * Get the probabilities for a given point.
- *
- * @param index Point ID
- * @return Probabilities of given point
- */
- public double[] getProbClusterIGivenX(DBIDRef index) {
- return probClusterIGivenX.get(index);
+ return emSum / relation.size();
}
@Override
@@ -402,6 +398,20 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
}
/**
+   * @return whether soft cluster assignments are retained
+ */
+ public boolean isSoft() {
+ return soft;
+ }
+
+ /**
+   * @param soft whether to retain soft cluster assignments
+ */
+ public void setSoft(boolean soft) {
+ this.soft = soft;
+ }
+
+ /**
* Parameterization class.
*
* @author Erich Schubert
@@ -409,45 +419,77 @@ public class EM<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<
* @apiviz.exclude
*/
public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the number of clusters to find, must be an integer
+ * greater than 0.
+ */
+ public static final OptionID K_ID = new OptionID("em.k", "The number of clusters to find.");
+
+ /**
+ * Parameter to specify the termination criterion for maximization of E(M):
+ * E(M) - E(M') < em.delta, must be a double equal to or greater than 0.
+ */
+ public static final OptionID DELTA_ID = new OptionID("em.delta", //
+ "The termination criterion for maximization of E(M): " + //
+ "E(M) - E(M') < em.delta");
+
+ /**
+ * Parameter to specify the initialization method
+ */
+ public static final OptionID INIT_ID = new OptionID("kmeans.initialization", //
+ "Method to choose the initial means.");
+
+ /**
+ * Number of clusters.
+ */
protected int k;
+ /**
+ * Stopping threshold
+ */
protected double delta;
+ /**
+ * Initialization method
+ */
protected KMeansInitialization<V> initializer;
+ /**
+ * Maximum number of iterations.
+ */
protected int maxiter = -1;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyGeneratedInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, 0.0);
- deltaP.addConstraint(new GreaterEqualConstraint(0.0));
- if (config.grab(deltaP)) {
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(deltaP)) {
delta = deltaP.getValue();
}
IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
maxiterP.setOptional(true);
- if (config.grab(maxiterP)) {
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.getValue();
}
}
@Override
protected EM<V> makeInstance() {
- return new EM<>(k, delta, initializer, maxiter);
+ return new EM<>(k, delta, initializer, maxiter, false);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
index e928d041..a4a922df 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICS.java
@@ -33,10 +33,10 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter;
-import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.DistanceUtil;
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -146,7 +146,8 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
// boxing/unboxing.
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
if(!processedIDs.contains(iditer)) {
- // We need to do some ugly casts to be able to run the optimized version, unfortunately.
+ // We need to do some ugly casts to be able to run the optimized
+ // version, unfortunately.
@SuppressWarnings("unchecked")
final ClusterOrderResult<DoubleDistance> doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder);
@SuppressWarnings("unchecked")
@@ -304,7 +305,7 @@ public class OPTICS<O, D extends Distance<D>> extends AbstractDistanceBasedAlgor
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
index 583d402b..db343f3a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/OPTICSXi.java
@@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry;
import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
@@ -240,6 +239,10 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm<
// By default, clusters cover both the steep up and steep down area
int cstart = sda.getStartIndex();
int cend = sua.getEndIndex();
+ // Hotfix: never include infinity-reachable points at the end
+ while(cend > cstart && Double.isInfinite(clusterOrder.get(cend).getReachability().doubleValue())) {
+ --cend;
+ }
// However, we sometimes have to adjust this (Condition 4):
{
// Case b)
@@ -654,8 +657,8 @@ public class OPTICSXi<N extends NumberDistance<N, ?>> extends AbstractAlgorithm<
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter xiP = new DoubleParameter(XI_ID);
- xiP.addConstraint(new GreaterEqualConstraint(0.0));
- xiP.addConstraint(new LessConstraint(1.0));
+ xiP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ xiP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
if(config.grab(xiP)) {
xi = xiP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
index 95d9f23c..86bb9a09 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
@@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -328,7 +328,7 @@ public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> imple
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(minptsP)) {
minpts = minptsP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java
new file mode 100644
index 00000000..68dacf34
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationClusteringAlgorithm.java
@@ -0,0 +1,350 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import gnu.trove.iterator.TIntObjectIterator;
+import gnu.trove.map.hash.TIntObjectHashMap;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Cluster analysis by affinity propagation.
+ *
+ * Reference:
+ * <p>
+ * Clustering by Passing Messages Between Data Points<br />
+ * B. J. Frey and D. Dueck<br />
+ * Science Vol 315
+ * </p>
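+ * <p>
+ * Illustrative use (a sketch only; it assumes {@code db} and {@code relation}
+ * already hold the loaded vector data):<br />
+ * {@code AffinityPropagationInitialization<NumberVector<?>> init = new DistanceBasedInitializationWithMedian<NumberVector<?>, DoubleDistance>(SquaredEuclideanDistanceFunction.STATIC, .5);}<br />
+ * {@code AffinityPropagationClusteringAlgorithm<NumberVector<?>> ap = new AffinityPropagationClusteringAlgorithm<>(init, .5, 10, 1000);}<br />
+ * {@code Clustering<MedoidModel> result = ap.run(db, relation);}
+ * </p>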
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.composedOf AffinityPropagationInitialization
+ *
+ * @param <O> object type
+ */
+@Title("Affinity Propagation: Clustering by Passing Messages Between Data Points")
+@Reference(title = "Clustering by Passing Messages Between Data Points", authors = "B. J. Frey and D. Dueck", booktitle = "Science Vol 315", url = "http://dx.doi.org/10.1126/science.1136800")
+public class AffinityPropagationClusteringAlgorithm<O> extends AbstractAlgorithm<Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * Class logger
+ */
+ private static final Logging LOG = Logging.getLogger(AffinityPropagationClusteringAlgorithm.class);
+
+ /**
+ * Similarity initialization
+ */
+ AffinityPropagationInitialization<O> initialization;
+
+ /**
+ * Damping factor lambda.
+ */
+ double lambda = 0.5;
+
+ /**
+ * Terminate after 10 iterations with no changes.
+ */
+ int convergence = 10;
+
+ /**
+ * Maximum number of iterations.
+ */
+ int maxiter = 1000;
+
+ /**
+ * Constructor.
+ *
+ * @param initialization Similarity initialization
+ * @param lambda Damping factor
+ * @param convergence Termination threshold (Number of stable iterations)
+ * @param maxiter Maximum number of iterations
+ */
+ public AffinityPropagationClusteringAlgorithm(AffinityPropagationInitialization<O> initialization, double lambda, int convergence, int maxiter) {
+ super();
+ this.initialization = initialization;
+ this.lambda = lambda;
+ this.convergence = convergence;
+ this.maxiter = maxiter;
+ }
+
+ /**
+ * Perform affinity propagation clustering.
+ *
+ * @param db Database
+ * @param relation Relation
+ * @return Clustering result
+ */
+ public Clustering<MedoidModel> run(Database db, Relation<O> relation) {
+ ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
+ final int size = ids.size();
+
+ int[] assignment = new int[size];
+ double[][] s = initialization.getSimilarityMatrix(db, relation, ids);
+ double[][] r = new double[size][size];
+ double[][] a = new double[size][size];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null;
+ MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null;
+
+ int inactive = 0;
+ for(int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) {
+ // Update responsibility matrix:
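+      // r(i,k) = s(i,k) - max_{k' != k} [ a(i,k') + s(i,k') ], damped by lambda below.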
+ for(int i = 0; i < size; i++) {
+ double[] ai = a[i], ri = r[i], si = s[i];
+ // Find the two largest values (as initially maxk == i)
+ double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
+ int maxk = -1;
+ for(int k = 0; k < size; k++) {
+ double val = ai[k] + si[k];
+ if(val > max1) {
+ max2 = max1;
+ max1 = val;
+ maxk = k;
+ }
+ else if(val > max2) {
+ max2 = val;
+ }
+ }
+ // With the maximum value known, update r:
+ for(int k = 0; k < size; k++) {
+ double val = si[k] - ((k != maxk) ? max1 : max2);
+ ri[k] = ri[k] * lambda + val * (1. - lambda);
+ }
+ }
+ // Update availability matrix
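+      // a(i,k) = min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k))); the diagonal a(k,k) skips the min.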
+ for(int k = 0; k < size; k++) {
+ // Compute sum of max(0, r_ik) for all i.
+ // For r_kk, don't apply the max.
+ double colposum = 0.;
+ for(int i = 0; i < size; i++) {
+ if(i == k || r[i][k] > 0.) {
+ colposum += r[i][k];
+ }
+ }
+ for(int i = 0; i < size; i++) {
+ double val = colposum;
+ // Adjust column sum by the one extra term.
+ if(i == k || r[i][k] > 0.) {
+ val -= r[i][k];
+ }
+ if(i != k && val > 0.) { // min
+ val = 0.;
+ }
+ a[i][k] = a[i][k] * lambda + val * (1 - lambda);
+ }
+ }
+ int changed = 0;
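+      // Exemplar choice: argmax_j of a(i,j) + r(i,j); ties prefer the point itself (i == j).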
+ for(int i = 0; i < size; i++) {
+ double[] ai = a[i], ri = r[i];
+ double max = Double.NEGATIVE_INFINITY;
+ int maxj = -1;
+ for(int j = 0; j < size; j++) {
+ double v = ai[j] + ri[j];
+ if(v > max || (i == j && v >= max)) {
+ max = v;
+ maxj = j;
+ }
+ }
+ if(assignment[i] != maxj) {
+ changed += 1;
+ assignment[i] = maxj;
+ }
+ }
+ inactive = (changed > 0) ? 0 : (inactive + 1);
+ if(prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ if(aprog != null) {
+ aprog.setProcessed(size - changed, LOG);
+ }
+ }
+ if(aprog != null) {
+ aprog.setProcessed(aprog.getTotal(), LOG);
+ }
+ if(prog != null) {
+ prog.setCompleted(LOG);
+ }
+ // Cluster map, by lead object
+ TIntObjectHashMap<ModifiableDBIDs> map = new TIntObjectHashMap<>();
+ DBIDArrayIter i1 = ids.iter();
+ for(int i = 0; i1.valid(); i1.advance(), i++) {
+ int c = assignment[i];
+ // Add to cluster members:
+ ModifiableDBIDs cids = map.get(c);
+ if(cids == null) {
+ cids = DBIDUtil.newArray();
+ map.put(c, cids);
+ }
+ cids.add(i1);
+ }
+ // If we stopped early, the cluster lead might be in a different cluster.
+ for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) {
+ iter.advance(); // Trove iterator; advance first!
+ final int key = iter.key();
+ int targetkey = key;
+ ModifiableDBIDs tids = null;
+ // Chase arrows:
+      while(tids == null && assignment[targetkey] != targetkey) {
+ targetkey = assignment[targetkey];
+ tids = map.get(targetkey);
+ }
+ if(tids != null && targetkey != key) {
+ tids.addDBIDs(iter.value());
+ iter.remove();
+ }
+ }
+
+ Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering");
+ ModifiableDBIDs noise = DBIDUtil.newArray();
+ for(TIntObjectIterator<ModifiableDBIDs> iter = map.iterator(); iter.hasNext();) {
+ iter.advance(); // Trove iterator; advance first!
+ i1.seek(iter.key());
+ if(iter.value().size() > 1) {
+ MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1));
+ clustering.addToplevelCluster(new Cluster<>(iter.value(), mod));
+ }
+ else {
+ noise.add(i1);
+ }
+ }
+ if(noise.size() > 0) {
+ MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter()));
+ clustering.addToplevelCluster(new Cluster<>(noise, true, mod));
+ }
+ return clustering;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(initialization.getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <O> object type
+ */
+ public static class Parameterizer<O> extends AbstractParameterizer {
+ /**
+ * Parameter for the similarity matrix initialization
+ */
+    public static final OptionID INITIALIZATION_ID = new OptionID("ap.initialization", "Similarity matrix initialization.");
+
+ /**
+     * Parameter for the damping factor.
+ */
+    public static final OptionID LAMBDA_ID = new OptionID("ap.lambda", "Damping factor lambda. Usually 0.5 to 1.");
+
+ /**
+ * Parameter for the convergence factor.
+ */
+ public static final OptionID CONVERGENCE_ID = new OptionID("ap.convergence", "Number of stable iterations for convergence.");
+
+ /**
+     * Parameter for the maximum number of iterations.
+ */
+ public static final OptionID MAXITER_ID = new OptionID("ap.maxiter", "Maximum number of iterations.");
+
+ /**
+ * Initialization function for the similarity matrix.
+ */
+ AffinityPropagationInitialization<O> initialization;
+
+ /**
+     * Damping factor lambda.
+ */
+ double lambda = .5;
+
+ /**
+ * Number of stable iterations for convergence.
+ */
+ int convergence;
+
+ /**
+ * Maximum number of iterations.
+ */
+ int maxiter;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final ObjectParameter<AffinityPropagationInitialization<O>> param = new ObjectParameter<>(INITIALIZATION_ID, AffinityPropagationInitialization.class, DistanceBasedInitializationWithMedian.class);
+ if(config.grab(param)) {
+ initialization = param.instantiateClass(config);
+ }
+ final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, .5);
+ lambdaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ lambdaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(lambdaP)) {
+ lambda = lambdaP.doubleValue();
+ }
+ final IntParameter convergenceP = new IntParameter(CONVERGENCE_ID, 15);
+ convergenceP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(convergenceP)) {
+ convergence = convergenceP.intValue();
+ }
+ final IntParameter maxiterP = new IntParameter(MAXITER_ID, 1000);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.intValue();
+ }
+ }
+
+ @Override
+ protected AffinityPropagationClusteringAlgorithm<O> makeInstance() {
+ return new AffinityPropagationClusteringAlgorithm<>(initialization, lambda, convergence, maxiter);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java
new file mode 100644
index 00000000..5dbc54de
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/AffinityPropagationInitialization.java
@@ -0,0 +1,59 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
+
+/**
+ * Initialization methods for affinity propagation.
+ *
+ * @author Erich Schubert
+ */
+public interface AffinityPropagationInitialization<O> extends Parameterizable {
+ /**
+ * Quantile to use for the diagonal entries.
+ */
+ public static final OptionID QUANTILE_ID = new OptionID("ap.quantile", "Quantile to use for diagonal entries.");
+
+ /**
+ * Compute the initial similarity matrix.
+ *
+ * @param db Database
+ * @param relation Data relation
+ * @param ids indexed DBIDs
+ * @return Similarity matrix
+ */
+ double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids);
+
+ /**
+ * Get the data type information for the similarity computations.
+ *
+ * @return Data type
+ */
+ TypeInformation getInputTypeRestriction();
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java
new file mode 100644
index 00000000..2c8cabf9
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/DistanceBasedInitializationWithMedian.java
@@ -0,0 +1,148 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Distance based initialization.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+public class DistanceBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> {
+ /**
+ * Distance function.
+ */
+ DistanceFunction<? super O, D> distance;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ /**
+ * Constructor.
+ *
+ * @param distance Similarity function
+ * @param quantile Quantile
+ */
+ public DistanceBasedInitializationWithMedian(DistanceFunction<? super O, D> distance, double quantile) {
+ super();
+ this.distance = distance;
+ this.quantile = quantile;
+ }
+
+ @Override
+ public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) {
+ final int size = ids.size();
+ DistanceQuery<O, D> dq = db.getDistanceQuery(relation, distance);
+ double[][] mat = new double[size][size];
+ double[] flat = new double[(size * (size - 1)) >> 1];
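+    // flat collects the (negated) distances of the upper triangle; their quantile becomes the diagonal self-similarity.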
+ // TODO: optimize for double valued primitive distances.
+ DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
+ for (int i = 0, j = 0; i < size; i++, i1.advance()) {
+ double[] mati = mat[i];
+ i2.seek(i + 1);
+ for (int k = i + 1; k < size; k++, i2.advance()) {
+ mati[k] = -dq.distance(i1, i2).doubleValue();
+ mat[k][i] = mati[k]; // symmetry.
+ flat[j] = mati[k];
+ j++;
+ }
+ }
+ double median = QuickSelect.quantile(flat, quantile);
+ // On the diagonal, we place the median
+ for (int i = 0; i < size; i++) {
+ mat[i][i] = median;
+ }
+ return mat;
+ }
+
+ @Override
+ public TypeInformation getInputTypeRestriction() {
+ return distance.getInputTypeRestriction();
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ /**
+ * Parameter for the distance function.
+ */
+ public static final OptionID DISTANCE_ID = new OptionID("ap.distance", "Distance function to use.");
+
+ /**
+     * Distance function.
+ */
+ DistanceFunction<? super O, D> distance;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<DistanceFunction<? super O, D>> param = new ObjectParameter<>(DISTANCE_ID, DistanceFunction.class, SquaredEuclideanDistanceFunction.class);
+ if (config.grab(param)) {
+ distance = param.instantiateClass(config);
+ }
+
+ DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5);
+ if (config.grab(quantileP)) {
+ quantile = quantileP.doubleValue();
+ }
+ }
+
+ @Override
+ protected DistanceBasedInitializationWithMedian<O, D> makeInstance() {
+ return new DistanceBasedInitializationWithMedian<>(distance, quantile);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java
new file mode 100644
index 00000000..a138da96
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/SimilarityBasedInitializationWithMedian.java
@@ -0,0 +1,153 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.LinearKernelFunction;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Similarity based initialization.
+ *
+ * @author Erich Schubert
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+public class SimilarityBasedInitializationWithMedian<O, D extends NumberDistance<D, ?>> implements AffinityPropagationInitialization<O> {
+ /**
+ * Similarity function.
+ */
+ SimilarityFunction<? super O, D> similarity;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ /**
+ * Constructor.
+ *
+ * @param similarity Similarity function
+ * @param quantile Quantile
+ */
+ public SimilarityBasedInitializationWithMedian(SimilarityFunction<? super O, D> similarity, double quantile) {
+ super();
+ this.similarity = similarity;
+ this.quantile = quantile;
+ }
+
+ @Override
+ public double[][] getSimilarityMatrix(Database db, Relation<O> relation, ArrayDBIDs ids) {
+ final int size = ids.size();
+ SimilarityQuery<O, D> sq = db.getSimilarityQuery(relation, similarity);
+ double[][] mat = new double[size][size];
+ double[] flat = new double[(size * (size - 1)) >> 1];
+    // TODO: optimize for double valued primitive similarities.
+ DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
+ // Compute self-similarities first, for centering:
+ for (int i = 0; i < size; i++, i1.advance()) {
+ mat[i][i] = sq.similarity(i1, i1).doubleValue() * .5;
+ }
+ i1.seek(0);
+ for (int i = 0, j = 0; i < size; i++, i1.advance()) {
+ final double[] mati = mat[i]; // Probably faster access.
+ i2.seek(i + 1);
+ for (int k = i + 1; k < size; k++, i2.advance()) {
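+        // Center: s(i,k) = sim(i,k) - (sim(i,i) + sim(k,k)) / 2; the halves were precomputed on the diagonal.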
+ mati[k] = sq.similarity(i1, i2).doubleValue() - mati[i] - mat[k][k];
+ mat[k][i] = mati[k]; // symmetry.
+ flat[j] = mati[k];
+ j++;
+ }
+ }
+ double median = QuickSelect.quantile(flat, quantile);
+ // On the diagonal, we place the median
+ for (int i = 0; i < size; i++) {
+ mat[i][i] = median;
+ }
+ return mat;
+ }
+
+ @Override
+ public TypeInformation getInputTypeRestriction() {
+ return similarity.getInputTypeRestriction();
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <O> Object type
+ * @param <D> Distance type
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
+ /**
+ * Parameter for the similarity function.
+ */
+ public static final OptionID SIMILARITY_ID = new OptionID("ap.similarity", "Similarity function to use.");
+
+ /**
+ * Similarity function.
+ */
+ SimilarityFunction<? super O, D> similarity;
+
+ /**
+ * Quantile to use.
+ */
+ double quantile;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ ObjectParameter<SimilarityFunction<? super O, D>> param = new ObjectParameter<>(SIMILARITY_ID, SimilarityFunction.class, LinearKernelFunction.class);
+ if (config.grab(param)) {
+ similarity = param.instantiateClass(config);
+ }
+
+ DoubleParameter quantileP = new DoubleParameter(QUANTILE_ID, .5);
+ if (config.grab(quantileP)) {
+ quantile = quantileP.doubleValue();
+ }
+ }
+
+ @Override
+ protected SimilarityBasedInitializationWithMedian<O, D> makeInstance() {
+ return new SimilarityBasedInitializationWithMedian<>(similarity, quantile);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java
new file mode 100644
index 00000000..bc6059ac
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/affinitypropagation/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Affinity Propagation (AP) clustering.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.affinitypropagation; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java
new file mode 100644
index 00000000..8b875340
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/AbstractBiclustering.java
@@ -0,0 +1,302 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.BiclusterModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
+
+/**
+ * Abstract class as a convenience for different biclustering approaches.
+ * <p/>
+ * The typically required values describing submatrices are computed using the
+ * corresponding values within a database of NumberVectors.
+ * <p/>
+ * The database is supposed to present a data matrix with a row representing an
+ * entry ({@link NumberVector}), a column representing a dimension (attribute)
+ * of the {@link NumberVector}s.
+ *
+ * @author Arthur Zimek
+ * @param <V> a certain subtype of NumberVector - the data matrix is supposed to
+ * consist of rows where each row relates to an object of type V and the
+ * columns relate to the attribute values of these objects
+ * @param <M> Cluster model type
+ */
+public abstract class AbstractBiclustering<V extends NumberVector<?>, M extends BiclusterModel> extends AbstractAlgorithm<Clustering<M>> implements ClusteringAlgorithm<Clustering<M>> {
+ /**
+ * Keeps the currently set database.
+ */
+ private Database database;
+
+ /**
+ * Relation we use.
+ */
+ protected Relation<V> relation;
+
+ /**
+ * Iterator to use for more efficient random access.
+ */
+ private DBIDArrayIter iter;
+
+ /**
+ * The row ids corresponding to the currently set {@link #relation}.
+ */
+ protected ArrayDBIDs rowIDs;
+
+ /**
+ * Column dimensionality.
+ */
+ private int colDim;
+
+ /**
+ * Constructor.
+ */
+ protected AbstractBiclustering() {
+ super();
+ }
+
+ /**
+ * Prepares the algorithm for running on a specific database.
+ * <p/>
+   * Assigns the relation, the row ids, and the column dimensionality, then calls
+ * {@link #biclustering()}.
+ * <p/>
+ * Any concrete algorithm should be implemented within method
+ * {@link #biclustering()} by an inheriting biclustering approach.
+ *
+ * @param relation Relation to process
+ * @return Clustering result
+ */
+ public final Clustering<M> run(Relation<V> relation) {
+ this.relation = relation;
+ if (this.relation == null || this.relation.size() == 0) {
+ throw new IllegalArgumentException(ExceptionMessages.DATABASE_EMPTY);
+ }
+ colDim = RelationUtil.dimensionality(relation);
+ rowIDs = DBIDUtil.ensureArray(this.relation.getDBIDs());
+ iter = rowIDs.iter();
+ return biclustering();
+ }
+
+ /**
+ * Run the actual biclustering algorithm.
+ * <p/>
+ * This method is supposed to be called only from the method
+ * {@link #run}.
+ */
+ protected abstract Clustering<M> biclustering();
+
+ /**
+ * Convert a bitset into integer column ids.
+ *
+   * @param cols bitset of selected columns
+ * @return integer column ids
+ */
+ protected int[] colsBitsetToIDs(BitSet cols) {
+ int[] colIDs = new int[cols.cardinality()];
+ int colsIndex = 0;
+ for (int i = cols.nextSetBit(0); i >= 0; i = cols.nextSetBit(i + 1)) {
+ colIDs[colsIndex] = i;
+ colsIndex++;
+ }
+ return colIDs;
+ }
+
+ /**
+ * Convert a bitset into integer row ids.
+ *
+   * @param rows bitset of selected rows
+ * @return integer row ids
+ */
+ protected ArrayDBIDs rowsBitsetToIDs(BitSet rows) {
+ ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray(rows.cardinality());
+ DBIDArrayIter iter = this.rowIDs.iter();
+ for (int i = rows.nextSetBit(0); i >= 0; i = rows.nextSetBit(i + 1)) {
+ iter.seek(i);
+ rowIDs.add(iter);
+ }
+ return rowIDs;
+ }
+
+ /**
+ * Defines a Bicluster as given by the included rows and columns.
+ *
+ * @param rows the rows included in the Bicluster
+ * @param cols the columns included in the Bicluster
+ * @return a Bicluster as given by the included rows and columns
+ */
+ protected Cluster<BiclusterModel> defineBicluster(BitSet rows, BitSet cols) {
+ ArrayDBIDs rowIDs = rowsBitsetToIDs(rows);
+ int[] colIDs = colsBitsetToIDs(cols);
+ return new Cluster<>(rowIDs, new BiclusterModel(colIDs));
+ }
+
+ /**
+ * Defines a Bicluster as given by the included rows and columns.
+ *
+ * @param rows the rows included in the Bicluster
+ * @param cols the columns included in the Bicluster
+ * @return A Bicluster as given by the included rows and columns
+ */
+ protected Cluster<BiclusterModel> defineBicluster(long[] rows, long[] cols) {
+ ArrayDBIDs rowIDs = rowsBitsetToIDs(rows);
+ int[] colIDs = colsBitsetToIDs(cols);
+ return new Cluster<>(rowIDs, new BiclusterModel(colIDs));
+ }
+
+ /**
+ * Returns the value of the data matrix at row <code>row</code> and column
+ * <code>col</code>.
+ *
+ * @param row the row in the data matrix according to the current order of
+ * rows (refers to database entry
+ * <code>database.get(rowIDs[row])</code>)
+ * @param col the column in the data matrix according to the current order of
+ * columns (refers to the attribute value of a database entry
+ * <code>getValue(colIDs[col])</code>)
+ * @return the attribute value of the database entry as retrieved by
+ * <code>database.get(rowIDs[row]).getValue(colIDs[col])</code>
+ */
+ protected double valueAt(int row, int col) {
+ iter.seek(row);
+ return relation.get(iter).doubleValue(col);
+ }
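+
+ // Usage note (illustrative): the logical coordinates refer to the current
+ // row order, not to DBIDs. E.g., the mean of column c could be computed as:
+ //   double sum = 0.;
+ //   for(int r = 0; r < getRowDim(); r++) {
+ //     sum += valueAt(r, c);
+ //   }
+ //   double mean = sum / getRowDim();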
+
+ /**
+ * Get the DBID of a certain row
+ *
+ * @param row Row number
+ * @return DBID of this row
+ * @deprecated Expensive!
+ */
+ @Deprecated
+ protected DBID getRowDBID(int row) {
+ return rowIDs.get(row);
+ }
+
+ /**
+ * Convert a bitset into integer column ids.
+ *
+ * @param cols Bitset of selected columns
+ * @return integer column ids
+ */
+ protected int[] colsBitsetToIDs(long[] cols) {
+ int[] colIDs = new int[(int) BitsUtil.cardinality(cols)];
+ int colsIndex = 0;
+ for (int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ if (clong == 0L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for (int j = 0; j < Long.SIZE; ++j, ++cpos, clong >>>= 1) {
+ if ((clong & 1L) == 1L) {
+ colIDs[colsIndex] = cpos;
+ ++colsIndex;
+ }
+ }
+ }
+ return colIDs;
+ }
+
+ /**
+ * Convert a bitset into integer row ids.
+ *
+ * @param rows Bitset of selected rows
+ * @return integer row ids
+ */
+ protected ArrayDBIDs rowsBitsetToIDs(long[] rows) {
+ ArrayModifiableDBIDs rowIDs = DBIDUtil.newArray((int) BitsUtil.cardinality(rows));
+ DBIDArrayIter iter = this.rowIDs.iter();
+ outer: for (int rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if (rlong == 0L) {
+ iter.advance(Long.SIZE);
+ continue;
+ }
+ for (int i = 0; i < Long.SIZE; ++i, rlong >>>= 1, iter.advance()) {
+ if (!iter.valid()) {
+ break outer;
+ }
+ if ((rlong & 1L) == 1L) {
+ rowIDs.add(iter);
+ }
+ }
+ }
+ return rowIDs;
+ }
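+
+ // Note on the long[] encoding used by BitsUtil: bit i of the set is stored
+ // in word i / 64 at bit position i % 64, as the loops above assume. For
+ // example, a mask selecting rows 0, 1 and 64 would be the array { 0x3L, 0x1L }.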
+
+ /**
+ * Provides the number of rows of the data matrix.
+ *
+ * @return the number of rows of the data matrix
+ */
+ protected int getRowDim() {
+ return this.rowIDs.size();
+ }
+
+ /**
+ * Provides the number of columns of the data matrix.
+ *
+ * @return the number of columns of the data matrix
+ */
+ protected int getColDim() {
+ return colDim;
+ }
+
+ /**
+ * Getter for database.
+ *
+ * @return database
+ */
+ public Database getDatabase() {
+ return database;
+ }
+
+ /**
+ * Getter for the relation.
+ *
+ * @return relation
+ */
+ public Relation<V> getRelation() {
+ return relation;
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
new file mode 100644
index 00000000..e110faff
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
@@ -0,0 +1,900 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Perform Cheng and Church biclustering.
+ *
+ * <p>
+ * Reference: <br>
+ * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of
+ * the 8th International Conference on Intelligent Systems for Molecular Biology
+ * (ISMB), San Diego, CA, 2000.
+ * </p>
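+ * <p>
+ * Per bicluster, the search proceeds in three phases (Algorithms 1-3 of the
+ * paper): multiple node deletion, single node deletion, and node addition
+ * (optionally also adding inverted rows); the found bicluster is then masked
+ * with random values before the next search starts.
+ * </p>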
+ *
+ * @author Erich Schubert
+ * @param <V> Vector type.
+ */
+@Reference(authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)")
+public class ChengAndChurch<V extends NumberVector<?>> extends AbstractBiclustering<V, BiclusterWithInversionsModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(ChengAndChurch.class);
+
+ /**
+ * The minimum number of columns that the data matrix must have before
+ * multiple-column removal is performed in {@link #multipleNodeDeletion}.
+ * <p>
+ * Multiple columns are only deleted when more than 100 columns are in the
+ * data matrix.
+ * </p>
+ */
+ private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100;
+
+ /**
+ * The minimum number of rows that the data matrix must have before
+ * multiple-row removal is performed in {@link #multipleNodeDeletion}.
+ * <p>
+ * Multiple rows are only deleted when more than 100 rows are in the data
+ * matrix.
+ * </p>
+ * <!--
+ * <p>
+ * The value is set to 100, as this is not really specified in the paper.
+ * </p>
+ * -->
+ */
+ private static final int MIN_ROW_REMOVE_THRESHOLD = 100;
+
+ /**
+ * Threshold for the score.
+ */
+ private double delta;
+
+ /**
+ * The parameter for multiple node deletion.
+ * <p>
+ * It is used to magnify the {@link #delta} value in the
+ * {@link #multipleNodeDeletion} method.
+ * </p>
+ */
+ private double alpha;
+
+ /**
+ * Number of biclusters to be found.
+ */
+ private int n;
+
+ /**
+ * Allow inversion of rows in the last phase.
+ */
+ private boolean useinverted = true;
+
+ /**
+ * Distribution to sample random replacement values from.
+ */
+ private Distribution dist;
+
+ /**
+ * Constructor.
+ *
+ * @param delta Delta parameter: desired quality
+ * @param alpha Alpha parameter: controls switching to single node deletion
+ * approach
+ * @param n Number of clusters to detect
+ * @param dist Distribution of random values to insert
+ */
+ public ChengAndChurch(double delta, double alpha, int n, Distribution dist) {
+ super();
+ this.delta = delta;
+ this.alpha = alpha;
+ this.n = n;
+ this.dist = dist;
+ }
+
+ /**
+ * Visitor pattern for processing cells.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static interface CellVisitor {
+ /** Different modes of operation. */
+ int ALL = 0, SELECTED = 1, NOT_SELECTED = 2;
+
+ /**
+ * Visit a cell.
+ *
+ * @param val Value
+ * @param row Row Number
+ * @param col Column number
+ * @param selrow Boolean, whether row is selected
+ * @param selcol Boolean, whether column is selected
+ * @return Stop flag, return {@code true} to stop visiting
+ */
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol);
+ }
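+
+ // Usage sketch (illustrative): finding the maximum over all selected cells
+ // with an anonymous visitor; the single-element array is a mutable holder
+ // for the closure, as done elsewhere in this class.
+ //   final double[] max = { Double.NEGATIVE_INFINITY };
+ //   cand.visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ //     @Override
+ //     public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ //       max[0] = Math.max(max[0], val);
+ //       return false; // false = continue visiting
+ //     }
+ //   });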
+
+ /**
+ * Bicluster candidate.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ protected static class BiclusterCandidate {
+ /**
+ * Cardinalities.
+ */
+ int rowcard, colcard;
+
+ /**
+ * Means.
+ */
+ double[] rowM, colM;
+
+ /**
+ * Row and column bitmasks.
+ */
+ long[] rows, irow, cols;
+
+ /**
+ * Mean of the current bicluster.
+ */
+ double allM;
+
+ /**
+ * The current bicluster score (mean squared residue).
+ */
+ double residue;
+
+ /**
+ * Constructor.
+ *
+ * @param rows Row dimensionality.
+ * @param cols Column dimensionality.
+ */
+ protected BiclusterCandidate(int rows, int cols) {
+ super();
+ this.rows = BitsUtil.ones(rows);
+ this.irow = BitsUtil.zero(rows);
+ this.rowcard = rows;
+ this.rowM = new double[rows];
+ this.cols = BitsUtil.ones(cols);
+ this.colcard = cols;
+ this.colM = new double[cols];
+ }
+
+ /**
+ * Resets the values for the next cluster search.
+ */
+ protected void reset() {
+ rows = BitsUtil.ones(rowM.length);
+ rowcard = rowM.length;
+ cols = BitsUtil.ones(colM.length);
+ colcard = colM.length;
+ BitsUtil.zeroI(irow);
+ }
+
+ /**
+ * Visit all selected cells in the data matrix.
+ *
+ * @param mat Data matrix
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitAll(double[][] mat, int mode, CellVisitor visitor) {
+ // For efficiency, we manually iterate over the rows and column bitmasks.
+ // This avoids the repeated shifting that per-bit access would require.
+ for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
+ boolean rselected = ((rlong & 1L) == 1L);
+ if((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) {
+ continue;
+ }
+ for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ if((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
+ boolean cselected = ((clong & 1L) == 1L);
+ if((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) {
+ continue;
+ }
+ boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Visit a column of the matrix.
+ *
+ * @param mat Data matrix
+ * @param col Column to visit
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) {
+ boolean cselected = BitsUtil.get(cols, col);
+ // For efficiency, we manually iterate over the rows and column bitmasks.
+ // This avoids the repeated shifting that per-bit access would require.
+ for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if(mode == CellVisitor.SELECTED && rlong == 0L) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && rlong == -1L) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
+ boolean rselected = ((rlong & 1L) == 1L);
+ if(mode == CellVisitor.SELECTED && !rselected) {
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && rselected) {
+ continue;
+ }
+ boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+
+ /**
+ * Visit a row of the data matrix.
+ *
+ * @param mat Data matrix
+ * @param row Row to visit
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) {
+ boolean rselected = BitsUtil.get(rows, row);
+ final double[] rowdata = mat[row];
+ for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ // Fast skip blocks of 64 masked values.
+ if(mode == CellVisitor.SELECTED && clong == 0L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && clong == -1L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
+ boolean cselected = ((clong & 1L) == 1L);
+ if(mode == CellVisitor.SELECTED && !cselected) {
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && cselected) {
+ continue;
+ }
+ boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+
+ /** Visitor for updating the means. */
+ private final CellVisitor MEANVISITOR = new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ if(selcol) {
+ rowM[row] += val;
+ }
+ if(selrow) {
+ colM[col] += val;
+ }
+ if(selcol && selrow) {
+ allM += val;
+ }
+ return false;
+ }
+ };
+
+ /**
+ * Update the row means and column means.
+ *
+ * @param mat Data matrix
+ * @param all Flag, to update all
+ * @return overall mean
+ */
+ protected double updateRowAndColumnMeans(final double[][] mat, boolean all) {
+ final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED;
+ Arrays.fill(rowM, 0.);
+ Arrays.fill(colM, 0.);
+ allM = 0.;
+ visitAll(mat, mode, MEANVISITOR);
+ visitColumn(mat, 0, mode, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ rowM[row] /= colcard;
+ return false;
+ }
+ });
+ visitRow(mat, 0, mode, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ colM[col] /= rowcard;
+ return false;
+ }
+ });
+ allM /= colcard * rowcard;
+ return allM;
+ }
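+
+ /*
+ * Reference note: in the notation of the paper, this computes the row
+ * means a_iJ = 1/|J| * sum_{j in J} a_ij (rowM), the column means
+ * a_Ij = 1/|I| * sum_{i in I} a_ij (colM), and the overall bicluster mean
+ * a_IJ (allM), where I are the selected rows and J the selected columns.
+ */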
+
+ /**
+ * Compute the mean squared residue.
+ *
+ * @param mat Data matrix
+ * @return mean squared residue
+ */
+ protected double computeMeanSquaredDeviation(final double[][] mat) {
+ final Mean msr = new Mean();
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow && selcol);
+ double v = val - rowM[row] - colM[col] + allM;
+ msr.put(v * v);
+ return false;
+ }
+ });
+ residue = msr.getMean();
+ return residue;
+ }
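+
+ /*
+ * Reference note: this is the mean squared residue of Cheng and Church,
+ * H(I,J) = 1/(|I||J|) * sum_{i in I, j in J} (a_ij - a_iJ - a_Ij + a_IJ)^2.
+ */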
+
+ /**
+ * Computes the <b>mean row residue</b> of the given <code>row</code>.
+ *
+ * @param mat Data matrix
+ * @param row The row whose residue should be computed.
+ * @param rowinverted Indicates if the row should be considered inverted.
+ * @return The row residue of the given <code>row</code>.
+ */
+ protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) {
+ final Mean rowResidue = new Mean();
+ visitRow(mat, row, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ final double rowMean = rowM[row];
+ final double colMean = colM[col];
+ double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM;
+ rowResidue.put(v * v);
+ return false;
+ }
+ });
+ return rowResidue.getMean();
+ }
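+
+ /*
+ * Reference note: the (non-inverted) row residue is
+ * d(i,J) = 1/|J| * sum_{j in J} (a_ij - a_iJ - a_Ij + a_IJ)^2; for
+ * inverted rows, the sign of (a_ij - a_iJ) is flipped, as above.
+ */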
+
+ /**
+ * Computes the <b>mean column residue</b> of the given <code>col</code>.
+ *
+ * @param mat Data matrix
+ * @param col The column whose residue should be computed.
+ * @return The column residue of the given <code>col</code>.
+ */
+ protected double computeColResidue(final double[][] mat, final int col) {
+ final double bias = colM[col] - allM;
+ final Mean colResidue = new Mean();
+ visitColumn(mat, col, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ final double rowMean = rowM[row];
+ double v = val - rowMean - bias;
+ colResidue.put(v * v);
+ return false;
+ }
+ });
+ return colResidue.getMean();
+ }
+
+ /**
+ * Replaces the values of all cells in the currently selected rows and
+ * columns with random values, to mask the found bicluster.
+ *
+ * @param mat Matrix to update.
+ * @param replacement Distribution to sample replacement values from.
+ */
+ protected void maskMatrix(final double[][] mat, final Distribution replacement) {
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow && selcol);
+ mat[row][col] = replacement.nextRandom();
+ return false;
+ }
+ });
+ }
+
+ /**
+ * Select or deselect a column.
+ *
+ * @param cnum Column to select
+ * @param set Value to set
+ */
+ protected void selectColumn(int cnum, boolean set) {
+ if(set) {
+ BitsUtil.setI(cols, cnum);
+ colcard++;
+ }
+ else {
+ BitsUtil.clearI(cols, cnum);
+ colcard--;
+ }
+ }
+
+ /**
+ * Select or deselect a row.
+ *
+ * @param rnum Row to select
+ * @param set Value to set
+ */
+ protected void selectRow(int rnum, boolean set) {
+ if(set) {
+ BitsUtil.setI(rows, rnum);
+ rowcard++;
+ }
+ else {
+ BitsUtil.clearI(rows, rnum);
+ rowcard--;
+ }
+ }
+
+ /**
+ * Mark a row as inverted.
+ *
+ * @param rnum Row number
+ * @param b Value to set; note that the current implementation only ever
+ * sets the inversion flag, it does not clear it.
+ */
+ protected void invertRow(int rnum, boolean b) {
+ BitsUtil.setI(irow, rnum);
+ }
+ }
+
+ @Override
+ public Clustering<BiclusterWithInversionsModel> biclustering() {
+ double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);
+
+ BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());
+
+ Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
+ ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());
+
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
+ for(int i = 0; i < n; i++) {
+ cand.reset();
+ multipleNodeDeletion(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ singleNodeDeletion(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ nodeAddition(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ cand.maskMatrix(mat, dist);
+ BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
+ final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
+ noise.removeDBIDs(cids);
+ result.addToplevelCluster(new Cluster<>(cids, model));
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
+ LOG.verbose("Number of rows: " + cand.rowcard + "\n");
+ LOG.verbose("Number of columns: " + cand.colcard + "\n");
+ // LOG.verbose("Total number of masked values: " + maskedVals.size() +
+ // "\n");
+ }
+ if(prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ }
+ // Add a noise cluster, full-dimensional.
+ if(!noise.isEmpty()) {
+ long[] allcols = BitsUtil.ones(getColDim());
+ BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
+ result.addToplevelCluster(new Cluster<>(noise, true, model));
+ }
+ if(prog != null) {
+ prog.ensureCompleted(LOG);
+ }
+ return result;
+ }
+
+ /**
+ * Algorithm 1 of Cheng and Church:
+ *
+ * Remove single rows or columns.
+ *
+ * Inverted rows are not supported in this method.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
+ // Assume that cand.residue is up to date!
+ while(cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) {
+ // Store the current maximum. We need final but mutable holders, so we use arrays.
+ final double[] max = { Double.NEGATIVE_INFINITY };
+ final int[] best = { -1, -1 };
+
+ // Test rows
+ if(cand.rowcard > 2) {
+ cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ double rowResidue = cand.computeRowResidue(mat, row, false);
+ if(max[0] < rowResidue) {
+ max[0] = rowResidue;
+ best[0] = row;
+ }
+ return false;
+ }
+ });
+ }
+
+ // Test columns:
+ if(cand.colcard > 2) {
+ cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ double colResidue = cand.computeColResidue(mat, col);
+ if(max[0] < colResidue) {
+ max[0] = colResidue;
+ best[1] = col;
+ }
+ return false;
+ }
+ });
+ }
+
+ if(best[1] >= 0) { // A column has the largest residue, so remove the column.
+ cand.selectColumn(best[1], false);
+ }
+ else {
+ assert (best[0] >= 0);
+ cand.selectRow(best[0], false);
+ }
+ // TODO: incremental update could be much faster?
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ }
+ }
+
+ /**
+ * Algorithm 2 of Cheng and Church.
+ *
+ * Remove all rows and columns whose mean residue exceeds alpha times the
+ * overall mean squared residue.
+ *
+ * Inverted rows are not supported in this method.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+
+ // Note: assumes that cand.residue = H(I,J)
+ while(cand.residue > delta) {
+ final boolean[] modified = { false, false };
+
+ // Step 2: remove rows above threshold
+ if(cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
+ final double alphaResidue = alpha * cand.residue;
+ cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ if(cand.computeRowResidue(mat, row, false) > alphaResidue) {
+ cand.selectRow(row, false);
+ modified[0] = true;
+ }
+ return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
+ }
+ });
+
+ // Step 3: update residue
+ if(modified[0]) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+ }
+
+ // Step 4: remove columns above threshold
+ if(cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
+ final double alphaResidue = alpha * cand.residue;
+ cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ if(cand.computeColResidue(mat, col) > alphaResidue) {
+ cand.selectColumn(col, false);
+ modified[1] = true;
+ }
+ return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
+ }
+ });
+ if(modified[1]) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+ }
+
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ // Step 5: if nothing has been removed, try removing single nodes.
+ if(!modified[0] && !modified[1]) {
+ break;
+ // Will be executed next in main loop, as per algorithm 4.
+ // singleNodeDeletion();
+ }
+ }
+ }
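+
+ /*
+ * Worked example (illustrative numbers): with alpha = 1.2 and a current
+ * residue H(I,J) = 10.0, every row or column whose mean residue exceeds
+ * 12.0 is removed in one pass, before H(I,J) is recomputed.
+ */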
+
+ /**
+ * Algorithm 3 of Cheng and Church.
+ *
+ * Try to re-add rows or columns that decrease the overall score.
+ *
+ * Also try adding inverted rows.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ while(true) {
+ // We need this to be final + mutable
+ final boolean[] added = new boolean[] { false, false };
+
+ // Step 2: add columns
+ cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selcol);
+ if(cand.computeColResidue(mat, col) <= cand.residue) {
+ cand.selectColumn(col, true);
+ added[0] = true;
+ }
+ return false;
+ }
+ });
+
+ // Step 3: recompute values
+ if(added[0]) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+
+ // Step 4: try adding rows.
+ cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selrow);
+ if(cand.computeRowResidue(mat, row, false) <= cand.residue) {
+ cand.selectRow(row, true);
+ added[1] = true;
+ }
+ return false;
+ }
+ });
+
+ // Step 5: try adding inverted rows.
+ if(useinverted) {
+ cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selrow);
+ if(cand.computeRowResidue(mat, row, true) <= cand.residue) {
+ cand.selectRow(row, true);
+ cand.invertRow(row, true);
+ added[1] = true;
+ }
+ return false;
+ }
+ });
+ }
+ if(added[1]) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ }
+ if(!added[0] && !added[1]) {
+ break;
+ }
+ }
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the distribution of replacement values when masking
+ * a cluster.
+ */
+ public static final OptionID DIST_ID = new OptionID("chengandchurch.replacement", "Distribution of replacement values when masking found clusters.");
+
+ /**
+ * Threshold value to determine the maximal acceptable score (mean squared
+ * residue) of a bicluster.
+ * <p>
+ * Key: {@code -chengandchurch.delta}
+ * </p>
+ */
+ public static final OptionID DELTA_ID = new OptionID("chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster.");
+
+ /**
+ * Parameter for multiple node deletion to accelerate the algorithm
+ * (must be &gt;= 1).
+ * <p>
+ * Key: {@code -chengandchurch.alpha}
+ * </p>
+ */
+ public static final OptionID ALPHA_ID = new OptionID("chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm.");
+
+ /**
+ * Number of biclusters to be found.
+ * <p>
+ * Default value: 1
+ * </p>
+ * <p>
+ * Key: {@code -chengandchurch.n}
+ * </p>
+ */
+ public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found.");
+
+ /**
+ * Threshold for the score ({@link #DELTA_ID}).
+ */
+ private double delta;
+
+ /**
+ * The parameter for multiple node deletion.
+ * <p>
+ * It is used to magnify the {@link #delta} value in the
+ * {@link ChengAndChurch#multipleNodeDeletion} method.
+ * </p>
+ */
+ private double alpha;
+
+ /**
+ * Number of biclusters to be found.
+ */
+ private int n;
+
+ /**
+ * Distribution of replacement values.
+ */
+ private Distribution dist;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
+ if(config.grab(deltaP)) {
+ delta = deltaP.doubleValue();
+ }
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+
+ IntParameter nP = new IntParameter(N_ID, 1);
+ nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(nP)) {
+ n = nP.intValue();
+ }
+
+ DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.);
+ alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE);
+ if(config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+
+ ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class);
+ if(config.grab(distP)) {
+ dist = distP.instantiateClass(config);
+ }
+ }
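+
+ /*
+ * Usage sketch (assumed wiring, following conventions seen elsewhere in
+ * this codebase; not verified here): the parameters could also be
+ * supplied programmatically, e.g.
+ *
+ *   ListParameterization params = new ListParameterization();
+ *   params.addParameter(DELTA_ID, 0.5);
+ *   params.addParameter(N_ID, 10);
+ *   ChengAndChurch<DoubleVector> cc = ClassGenericsUtil.parameterizeOrAbort(ChengAndChurch.class, params);
+ */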
+
+ @Override
+ protected ChengAndChurch<V> makeInstance() {
+ return new ChengAndChurch<>(delta, alpha, n, dist);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java
new file mode 100644
index 00000000..21363bfc
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * <p>Biclustering algorithms.</p>
+ */
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2013
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
index 0d82add9..8e5fa627 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
@@ -74,7 +74,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
@@ -838,22 +838,22 @@ public class CASH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(minptsP)) {
minpts = minptsP.getValue();
}
IntParameter maxlevelP = new IntParameter(MAXLEVEL_ID);
- maxlevelP.addConstraint(new GreaterConstraint(0));
+ maxlevelP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(maxlevelP)) {
maxlevel = maxlevelP.getValue();
}
IntParameter mindimP = new IntParameter(MINDIM_ID, 1);
- mindimP.addConstraint(new GreaterConstraint(0));
+ mindimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(mindimP)) {
mindim = mindimP.getValue();
}
DoubleParameter jitterP = new DoubleParameter(JITTER_ID);
- jitterP.addConstraint(new GreaterConstraint(0));
+ jitterP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
if (config.grab(jitterP)) {
jitter = jitterP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
index 9a4b8512..68878aef 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/COPAC.java
@@ -29,7 +29,7 @@ import java.util.Map;
import java.util.Map.Entry;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
@@ -270,7 +270,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs
public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) {
ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters);
ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query);
- reconfig.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist);
+ reconfig.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist);
ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm);
reconfig.failOnErrors();
return instance;
@@ -335,7 +335,7 @@ public class COPAC<V extends NumberVector<?>, D extends Distance<D>> extends Abs
ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class);
if(config.grab(algP)) {
ListParameterization predefined = new ListParameterization();
- predefined.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI);
+ predefined.addParameter(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI);
TrackParameters trackpar = new TrackParameters(config);
ChainedParameterization chain = new ChainedParameterization(predefined, trackpar);
chain.errorsTo(config);
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
index d1b714bf..79ddc16e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/HiCO.java
@@ -36,9 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -162,33 +160,34 @@ public class HiCO<V extends NumberVector<?>> extends OPTICS<V, PCACorrelationDis
super.makeOptions(config);
IntParameter muP = new IntParameter(MU_ID);
- muP.addConstraint(new GreaterConstraint(0));
- if (config.grab(muP)) {
+ muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(muP)) {
mu = muP.getValue();
}
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
kP.setOptional(true);
final int k;
- if (config.grab(kP)) {
+ if(config.grab(kP)) {
k = kP.getValue();
- } else {
+ }
+ else {
k = mu;
}
DoubleParameter deltaP = new DoubleParameter(DELTA_ID, DEFAULT_DELTA);
- deltaP.addConstraint(new GreaterEqualConstraint(0));
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
double delta = DEFAULT_DELTA;
- if (config.grab(deltaP)) {
+ if(config.grab(deltaP)) {
delta = deltaP.doubleValue();
}
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0));
- alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = DEFAULT_ALPHA;
- if (config.grab(alphaP)) {
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
index f9531be0..99144b42 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/LMCLUS.java
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -116,7 +116,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
* Number of sampling rounds to find a good split
*/
private final int samplingLevel;
-
+
/**
* Random factory
*/
@@ -163,34 +163,34 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
- Random r = rnd.getRandom();
+ Random r = rnd.getSingleThreadedRandom();
final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
int cnum = 0;
- while (unclustered.size() > minsize) {
+ while(unclustered.size() > minsize) {
DBIDs current = unclustered;
int lmDim = 1;
- for (int k = 1; k <= maxdim; k++) {
+ for(int k = 1; k <= maxdim; k++) {
// Implementation note: this while loop is from the original publication
// and the published LMCLUS source code. It doesn't make sense to me -
// it is lacking a stop criterion other than "cluster is too small" and
// "cluster is inseparable"! Additionally, there is good criterion for
// stopping at the appropriate dimensionality either.
- while (true) {
+ while(true) {
Separation separation = findSeparation(relation, current, k, r);
// logger.verbose("k: " + k + " goodness: " + separation.goodness +
// " threshold: " + separation.threshold);
- if (separation.goodness <= sensitivityThreshold) {
+ if(separation.goodness <= sensitivityThreshold) {
break;
}
ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
- for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
- if (deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
+ for(DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
+ if(deviation(relation.get(iter).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
subset.add(iter);
}
}
// logger.verbose("size:"+subset.size());
- if (subset.size() < minsize) {
+ if(subset.size() < minsize) {
break;
}
current = subset;
@@ -199,7 +199,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
}
}
// No more clusters found
- if (current.size() < minsize || current == unclustered) {
+ if(current.size() < minsize || current == unclustered) {
break;
}
// New cluster found
@@ -210,22 +210,22 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
ret.addToplevelCluster(cluster);
// Remove from main working set.
unclustered.removeDBIDs(current);
- if (progress != null) {
+ if(progress != null) {
progress.setProcessed(relation.size() - unclustered.size(), LOG);
}
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setProcessed(cnum, LOG);
}
}
// Remaining objects are noise
- if (unclustered.size() > 0) {
+ if(unclustered.size() > 0) {
ret.addToplevelCluster(new Cluster<>(unclustered, true));
}
- if (progress != null) {
+ if(progress != null) {
progress.setProcessed(relation.size(), LOG);
progress.ensureCompleted(LOG);
}
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setCompleted(LOG);
}
return ret;
@@ -272,7 +272,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size());
// System.out.println("Number of samples: " + samples);
int remaining_retries = 100;
- for (int i = 1; i <= samples; i++) {
+ for(int i = 1; i <= samples; i++) {
DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong());
final DBIDIter iter = sample.iter();
// Use first as origin
@@ -282,17 +282,17 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
Matrix basis;
{
List<Vector> vectors = new ArrayList<>(sample.size() - 1);
- for (; iter.valid(); iter.advance()) {
+ for(; iter.valid(); iter.advance()) {
Vector vec = relation.get(iter).getColumnVector();
vectors.add(vec.minusEquals(originV));
}
// generate orthogonal basis
basis = generateOrthonormalBasis(vectors);
- if (basis == null) {
+ if(basis == null) {
// new sample has to be taken.
i--;
remaining_retries--;
- if (remaining_retries < 0) {
+ if(remaining_retries < 0) {
throw new AbortException("Too many retries in sampling, and always a linear dependant data set.");
}
continue;
@@ -301,9 +301,9 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
// Generate and fill a histogram.
DoubleDynamicHistogram histogram = new DoubleDynamicHistogram(BINS);
double w = 1.0 / currentids.size();
- for (DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = currentids.iter(); iter2.valid(); iter2.advance()) {
// Skip sampled points
- if (sample.contains(iter2)) {
+ if(sample.contains(iter2)) {
continue;
}
Vector vec = relation.get(iter2).getColumnVector().minusEquals(originV);
@@ -311,7 +311,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
histogram.increment(distance, w);
}
double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold
- if (th[1] > separation.goodness) {
+ if(th[1] > separation.goodness) {
separation.goodness = th[1];
separation.threshold = th[0];
separation.originV = originV;
@@ -341,16 +341,16 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
first = first.times(1.0 / first.euclideanLength());
Matrix ret = new Matrix(first.getDimensionality(), vectors.size());
ret.setCol(0, first);
- for (int i = 1; i < vectors.size(); i++) {
+ for(int i = 1; i < vectors.size(); i++) {
// System.out.println("Matrix:" + ret);
Vector v_i = vectors.get(i);
Vector u_i = v_i.copy();
// System.out.println("Vector " + i + ":" + partialSol);
- for (int j = 0; j < i; j++) {
+ for(int j = 0; j < i; j++) {
Vector v_j = ret.getCol(j);
double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j);
- if (Double.isNaN(f)) {
- if (LOG.isDebuggingFine()) {
+ if(Double.isNaN(f)) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Zero vector encountered? " + v_j);
}
return null;
@@ -359,8 +359,8 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
}
// check if the vectors weren't independent
final double len_u_i = u_i.euclideanLength();
- if (len_u_i == 0.0) {
- if (LOG.isDebuggingFine()) {
+ if(len_u_i == 0.0) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Points not independent - no orthonormalization.");
}
return null;
@@ -391,7 +391,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
{
MeanVariance mv = new MeanVariance();
DoubleHistogram.Iter forward = histogram.iter();
- for (int i = 0; forward.valid(); i++, forward.advance()) {
+ for(int i = 0; forward.valid(); i++, forward.advance()) {
p1[i] = forward.getValue() + ((i > 0) ? p1[i - 1] : 0);
mv.put(i, forward.getValue());
mu1[i] = mv.getMean();
@@ -404,7 +404,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
DoubleHistogram.Iter backwards = histogram.iter();
backwards.seek(histogram.getNumBins() - 1); // Seek to last
- for (int j = n - 1; backwards.valid(); j--, backwards.retract()) {
+ for(int j = n - 1; backwards.valid(); j--, backwards.retract()) {
p2[j] = backwards.getValue() + ((j + 1 < n) ? p2[j + 1] : 0);
mv.put(j, backwards.getValue());
mu2[j] = mv.getMean();
@@ -412,7 +412,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
}
}
- for (int i = 0; i < n; i++) {
+ for(int i = 0; i < n; i++) {
jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i])));
}
@@ -420,23 +420,23 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
double bestgoodness = Double.NEGATIVE_INFINITY;
double devPrev = jt[1] - jt[0];
- for (int i = 1; i < jt.length - 1; i++) {
+ for(int i = 1; i < jt.length - 1; i++) {
double devCur = jt[i + 1] - jt[i];
// System.out.println(p1[i]);
// System.out.println(jt[i + 1]);
// System.out.println(jt[i]);
// System.out.println(devCur);
// Local minimum found - calculate depth
- if (devCur >= 0 && devPrev <= 0) {
+ if(devCur >= 0 && devPrev <= 0) {
double lowestMaxima = Double.POSITIVE_INFINITY;
- for (int j = i - 1; j > 0; j--) {
- if (jt[j - 1] < jt[j]) {
+ for(int j = i - 1; j > 0; j--) {
+ if(jt[j - 1] < jt[j]) {
lowestMaxima = Math.min(lowestMaxima, jt[j]);
break;
}
}
- for (int j = i + 1; j < n - 2; j++) {
- if (jt[j + 1] < jt[j]) {
+ for(int j = i + 1; j < n - 2; j++) {
+ if(jt[j + 1] < jt[j]) {
lowestMaxima = Math.min(lowestMaxima, jt[j]);
break;
}
@@ -445,11 +445,11 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
final double mud = mu1[i] - mu2[i];
double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]);
- if (Double.isNaN(discriminability)) {
+ if(Double.isNaN(discriminability)) {
discriminability = -1;
}
double goodness = localDepth * discriminability;
- if (goodness > bestgoodness) {
+ if(goodness > bestgoodness) {
bestgoodness = goodness;
bestpos = i;
}
@@ -552,7 +552,7 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
* Threshold
*/
private double threshold;
-
+
/**
* Random generator
*/
@@ -562,26 +562,26 @@ public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter maxLMDimP = new IntParameter(MAXDIM_ID);
- maxLMDimP.addConstraint(new GreaterEqualConstraint(1));
+ maxLMDimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
maxLMDimP.setOptional(true);
- if (config.grab(maxLMDimP)) {
+ if(config.grab(maxLMDimP)) {
maxdim = maxLMDimP.getValue();
}
IntParameter minsizeP = new IntParameter(MINSIZE_ID);
- minsizeP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(minsizeP)) {
+ minsizeP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minsizeP)) {
minsize = minsizeP.getValue();
}
IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100);
- if (config.grab(samplingLevelP)) {
+ if(config.grab(samplingLevelP)) {
samplingLevel = samplingLevelP.getValue();
}
DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID);
- if (config.grab(sensivityThresholdP)) {
+ if(config.grab(sensivityThresholdP)) {
threshold = sensivityThresholdP.getValue();
}
RandomParameter rndP = new RandomParameter(RANDOM_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
index a9c67a58..7733ddaa 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/ORCLUS.java
@@ -61,8 +61,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -135,7 +134,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// current dimensionality associated with each seed
int dim_c = RelationUtil.dimensionality(relation);
- if (dim_c < l) {
+ if(dim_c < l) {
throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + dim_c + " < " + l + ")");
}
@@ -149,8 +148,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null;
- while (k_c > k) {
- if (cprogress != null) {
+ while(k_c > k) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size(), LOG);
}
@@ -158,8 +157,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
assign(relation, distFunc, clusters);
// determine current subspace associated with each cluster
- for (ORCLUSCluster cluster : clusters) {
- if (cluster.objectIDs.size() > 0) {
+ for(ORCLUSCluster cluster : clusters) {
+ if(cluster.objectIDs.size() > 0) {
cluster.basis = findBasis(relation, distFunc, cluster, dim_c);
}
}
@@ -172,18 +171,19 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
}
assign(relation, distFunc, clusters);
- if (cprogress != null) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size());
cprogress.setCompleted(LOG);
}
// get the result
Clustering<Model> r = new Clustering<>("ORCLUS clustering", "orclus-clustering");
- for (ORCLUSCluster c : clusters) {
+ for(ORCLUSCluster c : clusters) {
r.addToplevelCluster(new Cluster<Model>(c.objectIDs, ClusterModel.CLUSTER));
}
return r;
- } catch (Exception e) {
+ }
+ catch(Exception e) {
throw new IllegalStateException(e);
}
}
@@ -199,7 +199,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
DBIDs randomSample = DBIDUtil.randomSample(database.getDBIDs(), k, rnd);
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database);
List<ORCLUSCluster> seeds = new ArrayList<>();
- for (DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = randomSample.iter(); iter.valid(); iter.advance()) {
seeds.add(new ORCLUSCluster(database.get(iter), iter, factory));
}
return seeds;
@@ -217,29 +217,29 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
private void assign(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters) {
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(database);
// clear the current clusters
- for (ORCLUSCluster cluster : clusters) {
+ for(ORCLUSCluster cluster : clusters) {
cluster.objectIDs.clear();
}
// projected centroids of the clusters
List<V> projectedCentroids = new ArrayList<>(clusters.size());
- for (ORCLUSCluster c : clusters) {
+ for(ORCLUSCluster c : clusters) {
projectedCentroids.add(projection(c, c.centroid, factory));
}
// for each data point o do
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
V o = database.get(it);
DoubleDistance minDist = null;
ORCLUSCluster minCluster = null;
// determine projected distance between o and cluster
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < clusters.size(); i++) {
ORCLUSCluster c = clusters.get(i);
V o_proj = projection(c, o, factory);
DoubleDistance dist = distFunc.distance(o_proj, projectedCentroids.get(i));
- if (minDist == null || minDist.compareTo(dist) > 0) {
+ if(minDist == null || minDist.compareTo(dist) > 0) {
minDist = dist;
minCluster = c;
}
@@ -250,8 +250,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
}
// recompute the seed in each clusters
- for (ORCLUSCluster cluster : clusters) {
- if (cluster.objectIDs.size() > 0) {
+ for(ORCLUSCluster cluster : clusters) {
+ if(cluster.objectIDs.size() > 0) {
cluster.centroid = Centroid.make(database, cluster.objectIDs).toVector(database);
}
}
@@ -271,7 +271,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// covariance matrix of cluster
// Matrix covariance = Util.covarianceMatrix(database, cluster.objectIDs);
GenericDistanceDBIDList<DoubleDistance> results = new GenericDistanceDBIDList<>(cluster.objectIDs.size());
- for (DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) {
+ for(DBIDIter it = cluster.objectIDs.iter(); it.valid(); it.advance()) {
DoubleDistance distance = distFunc.distance(cluster.centroid, database.get(it));
results.add(distance, it);
}
@@ -304,9 +304,9 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
*/
private void merge(Relation<V> database, DistanceQuery<V, DoubleDistance> distFunc, List<ORCLUSCluster> clusters, int k_new, int d_new, IndefiniteProgress cprogress) {
ArrayList<ProjectedEnergy> projectedEnergies = new ArrayList<>();
- for (int i = 0; i < clusters.size(); i++) {
- for (int j = 0; j < clusters.size(); j++) {
- if (i >= j) {
+ for(int i = 0; i < clusters.size(); i++) {
+ for(int j = 0; j < clusters.size(); j++) {
+ if(i >= j) {
continue;
}
// projected energy of c_ij in subspace e_ij
@@ -318,8 +318,8 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
}
}
- while (clusters.size() > k_new) {
- if (cprogress != null) {
+ while(clusters.size() > k_new) {
+ if(cprogress != null) {
cprogress.setProcessed(clusters.size(), LOG);
}
// find the smallest value of r_ij
@@ -327,12 +327,12 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// renumber the clusters by replacing cluster c_i with cluster c_ij
// and discarding cluster c_j
- for (int c = 0; c < clusters.size(); c++) {
- if (c == minPE.i) {
+ for(int c = 0; c < clusters.size(); c++) {
+ if(c == minPE.i) {
clusters.remove(c);
clusters.add(c, minPE.cluster);
}
- if (c == minPE.j) {
+ if(c == minPE.j) {
clusters.remove(c);
}
}
@@ -341,15 +341,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
int i = minPE.i;
int j = minPE.j;
Iterator<ProjectedEnergy> it = projectedEnergies.iterator();
- while (it.hasNext()) {
+ while(it.hasNext()) {
ProjectedEnergy pe = it.next();
- if (pe.i == i || pe.i == j || pe.j == i || pe.j == j) {
+ if(pe.i == i || pe.i == j || pe.j == i || pe.j == j) {
it.remove();
- } else {
- if (pe.i > j) {
+ }
+ else {
+ if(pe.i > j) {
pe.i -= 1;
}
- if (pe.j > j) {
+ if(pe.j > j) {
pe.j -= 1;
}
}
@@ -357,10 +358,11 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// ... and recompute them
ORCLUSCluster c_ij = minPE.cluster;
- for (int c = 0; c < clusters.size(); c++) {
- if (c < i) {
+ for(int c = 0; c < clusters.size(); c++) {
+ if(c < i) {
projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, c, i, d_new));
- } else if (c > i) {
+ }
+ else if(c > i) {
projectedEnergies.add(projectedEnergy(database, distFunc, clusters.get(c), c_ij, i, c, d_new));
}
}
@@ -389,7 +391,7 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
double sum = 0.;
V c_proj = projection(c_ij, c_ij.centroid, factory);
- for (DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = c_ij.objectIDs.iter(); iter.valid(); iter.advance()) {
V o_proj = projection(c_ij, database.get(iter), factory);
double dist = distFunc.distance(o_proj, c_proj).doubleValue();
sum += dist * dist;
@@ -417,15 +419,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
// convert into array.
c.objectIDs = DBIDUtil.newArray(c.objectIDs);
- if (c.objectIDs.size() > 0) {
+ if(c.objectIDs.size() > 0) {
c.centroid = Centroid.make(relation, c.objectIDs).toVector(relation);
c.basis = findBasis(relation, distFunc, c, dim);
- } else {
+ }
+ else {
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Vector cent = c1.centroid.getColumnVector().plusEquals(c2.centroid.getColumnVector()).timesEquals(0.5);
c.centroid = factory.newNumberVector(cent.getArrayRef());
double[][] doubles = new double[c1.basis.getRowDimensionality()][dim];
- for (int i = 0; i < dim; i++) {
+ for(int i = 0; i < dim; i++) {
doubles[i][i] = 1;
}
c.basis = new Matrix(doubles);
@@ -590,16 +593,16 @@ public class ORCLUS<V extends NumberVector<?>> extends AbstractProjectedClusteri
protected void configAlpha(Parameterization config) {
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.5);
- alphaP.addConstraint(new GreaterConstraint(0));
- alphaP.addConstraint(new LessEqualConstraint(1));
- if (config.grab(alphaP)) {
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
}
protected void configSeed(Parameterization config) {
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
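The merge() hunk above renumbers the cached cluster-pair energies after clusters i and j are joined: pairs touching either cluster are discarded for recomputation, and indexes above j shift down by one because cluster j leaves the list. A minimal, self-contained sketch of that invariant — not part of the patch, with a hypothetical Pair class standing in for ProjectedEnergy:

    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;

    public class RenumberDemo {
      static class Pair {
        int i, j;
        Pair(int i, int j) { this.i = i; this.j = j; }
      }

      // Merge clusters i and j (i < j): drop stale pairs, shift higher indexes.
      static void renumber(List<Pair> pairs, int i, int j) {
        for(Iterator<Pair> it = pairs.iterator(); it.hasNext();) {
          Pair p = it.next();
          if(p.i == i || p.i == j || p.j == i || p.j == j) {
            it.remove(); // Must be recomputed against the merged cluster.
          }
          else {
            // Cluster j was removed from the list, so higher indexes move down.
            if(p.i > j) { p.i -= 1; }
            if(p.j > j) { p.j -= 1; }
          }
        }
      }

      public static void main(String[] args) {
        List<Pair> pairs = new ArrayList<>(List.of(new Pair(0, 1), new Pair(0, 3), new Pair(2, 3)));
        renumber(pairs, 0, 1); // Merge clusters 0 and 1.
        for(Pair p : pairs) {
          System.out.println(p.i + "," + p.j); // Prints 1,2 (was 2,3).
        }
      }
    }

Dropping the stale pairs before shifting indexes is what keeps the i/j bookkeeping consistent with the shrunken cluster list.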
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
index 545a8171..1b316c7c 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/EpsilonNeighborPredicate.java
@@ -23,7 +23,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.DistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
@@ -67,12 +68,12 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh
/**
* Range to query with
*/
- D epsilon;
+ protected D epsilon;
/**
* Distance function to use
*/
- DistanceFunction<O, D> distFunc;
+ protected DistanceFunction<O, D> distFunc;
/**
* Full constructor.
@@ -177,14 +178,14 @@ public class EpsilonNeighborPredicate<O, D extends Distance<D>> implements Neigh
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
// Get a distance function.
- ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
+ ObjectParameter<DistanceFunction<O, D>> distanceP = new ObjectParameter<>(DistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
D distanceFactory = null;
if(config.grab(distanceP)) {
distfun = distanceP.instantiateClass(config);
distanceFactory = distfun.getDistanceFactory();
}
// Get the epsilon parameter
- DistanceParameter<D> epsilonP = new DistanceParameter<>(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.EPSILON_ID, distanceFactory);
+ DistanceParameter<D> epsilonP = new DistanceParameter<>(DBSCAN.Parameterizer.EPSILON_ID, distanceFactory);
if(config.grab(epsilonP)) {
epsilon = epsilonP.getValue();
}
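The makeOptions() above is a two-stage grab: the distance function must be resolved first, because its distance factory determines the value type of the epsilon parameter. A self-contained sketch of the same dependency — not ELKI code; the option strings and the parser interface are illustrative stand-ins:

    import java.util.Map;

    public class EpsilonDemo {
      interface DistanceParser {
        Object parse(String s);
      }

      static Object grabEpsilon(Map<String, String> config) {
        // Stage 1: resolve the distance function, yielding a value parser ("factory").
        String dist = config.getOrDefault("algorithm.distancefunction", "euclidean");
        DistanceParser factory = dist.equals("levenshtein")
            ? s -> Integer.valueOf(s)  // Integer-valued distance type.
            : s -> Double.valueOf(s);  // Double-valued distance type.
        // Stage 2: parse epsilon with the type obtained in stage 1.
        return factory.parse(config.get("dbscan.epsilon"));
      }

      public static void main(String[] args) {
        System.out.println(grabEpsilon(Map.of("dbscan.epsilon", "0.75"))); // 0.75
      }
    }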
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
index a6e62e2e..ac7ba81d 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/MinPtsCorePredicate.java
@@ -23,6 +23,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
@@ -54,7 +55,7 @@ public class MinPtsCorePredicate implements CorePredicate {
/**
* The minpts parameter.
*/
- int minpts;
+ protected int minpts;
/**
* Default constructor.
@@ -127,7 +128,7 @@ public class MinPtsCorePredicate implements CorePredicate {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
// Get the minpts parameter
- IntParameter minptsP = new IntParameter(de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN.MINPTS_ID);
+ IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID);
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java
index ac5cb77c..f6dbc88f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java
@@ -55,7 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
@@ -178,9 +178,10 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DataStore<D> lambda = pointerresult.getParentDistanceStore();
Clustering<DendrogramModel<D>> result;
- if (lambda instanceof DoubleDistanceDataStore) {
+ if(lambda instanceof DoubleDistanceDataStore) {
result = extractClustersDouble(ids, pi, (DoubleDistanceDataStore) lambda);
- } else {
+ }
+ else {
result = extractClusters(ids, pi, lambda);
}
result.addChildResult(pointerresult);
@@ -208,28 +209,31 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDArrayIter it = order.iter(); // Used multiple times!
int split;
- if (minclusters > 0) {
+ if(minclusters > 0) {
split = Math.max(ids.size() - minclusters, 0);
// Stop distance:
final D stopdist = lambda.get(order.get(split));
// Tie handling: decrement split.
- while (split > 0) {
+ while(split > 0) {
it.seek(split - 1);
- if (stopdist.compareTo(lambda.get(it)) <= 0) {
+ if(stopdist.compareTo(lambda.get(it)) <= 0) {
split--;
- } else {
+ }
+ else {
break;
}
}
- } else if (threshold != null) {
+ }
+ else if(threshold != null) {
split = ids.size();
it.seek(split - 1);
- while (threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) {
+ while(threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) {
split--;
it.retract();
}
- } else { // full hierarchy
+ }
+ else { // full hierarchy
split = 0;
}
@@ -242,19 +246,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDVar succ = DBIDUtil.newVar(); // Variable for successor.
// Go backwards on the lower part.
- for (it.seek(split - 1); it.valid(); it.retract()) {
+ for(it.seek(split - 1); it.valid(); it.retract()) {
D dist = lambda.get(it); // Distance to successor
pi.assignVar(it, succ); // succ = pi(it)
int clusterid = cluster_map.intValue(succ);
// Successor cluster has already been created:
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
cluster_dbids.get(clusterid).add(it);
cluster_map.putInt(it, clusterid);
// Update distance to maximum encountered:
- if (cluster_dist.get(clusterid).compareTo(dist) < 0) {
+ if(cluster_dist.get(clusterid).compareTo(dist) < 0) {
cluster_dist.set(clusterid, dist);
}
- } else {
+ }
+ else {
// Need to start a new cluster:
clusterid = cluster_dbids.size(); // next cluster number.
ModifiableDBIDs cids = DBIDUtil.newArray();
@@ -270,12 +275,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
// Decrement counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
final Clustering<DendrogramModel<D>> dendrogram;
- switch(outputmode) {
+ switch(outputmode){
case PARTIAL_HIERARCHY: {
// Build a hierarchy out of these clusters.
dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering");
@@ -284,74 +289,81 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i)));
}
cluster_dist = null; // Invalidate
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
// The current cluster led by the current element:
final Cluster<DendrogramModel<D>> clus;
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
clus = clusters.get(clusterid);
- } else if (!singletons && ids.size() != 1) {
+ }
+ else if(!singletons && ids.size() != 1) {
clus = null;
- } else {
+ }
+ else {
clus = makeCluster(it, null, DBIDUtil.deref(it));
}
// The successor to join:
pi.assignVar(it, succ); // succ = pi(it)
- if (DBIDUtil.equal(it, succ)) {
+ if(DBIDUtil.equal(it, succ)) {
assert (root == null);
root = clus;
- } else {
+ }
+ else {
// Parent cluster:
int parentid = cluster_map.intValue(succ);
D depth = lambda.get(it);
// Parent cluster exists - merge as a new cluster:
- if (parentid >= 0) {
+ if(parentid >= 0) {
final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid);
- if (pclus.getModel().getDistance().equals(depth)) {
- if (clus == null) {
+ if(pclus.getModel().getDistance().equals(depth)) {
+ if(clus == null) {
((ModifiableDBIDs) pclus.getIDs()).add(it);
- } else {
+ }
+ else {
dendrogram.addChildCluster(pclus, clus);
}
- } else {
+ }
+ else {
// Merge at new depth:
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids);
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(npclus, clus);
}
dendrogram.addChildCluster(npclus, pclus);
// Replace existing parent cluster: new depth
clusters.set(parentid, npclus);
}
- } else {
+ }
+ else {
// Merge with parent at this depth:
final Cluster<DendrogramModel<D>> pclus;
- if (!singletons) {
+ if(!singletons) {
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1);
cids.add(succ);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
// New cluster for parent and/or new point
pclus = makeCluster(succ, depth, cids);
- } else {
+ }
+ else {
// Create a new, one-element cluster for parent, and a merged
// cluster on top.
pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS);
dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ)));
}
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(pclus, clus);
}
// Store cluster:
@@ -362,7 +374,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
// Decrement counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -377,21 +389,21 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
dendrogram.addToplevelCluster(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i)));
}
cluster_dist = null; // Invalidate
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
- if (clusterid < 0) {
+ if(clusterid < 0) {
dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it)));
}
// Decrement counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -401,7 +413,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
throw new AbortException("Unsupported output mode.");
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
@@ -428,29 +440,32 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDArrayIter it = order.iter(); // Used multiple times!
int split;
- if (minclusters > 0) {
+ if(minclusters > 0) {
split = Math.max(ids.size() - minclusters, 0);
// Stop distance:
final double stopdist = lambda.doubleValue(order.get(split));
// Tie handling: decrement split.
- while (split > 0) {
+ while(split > 0) {
it.seek(split - 1);
- if (stopdist <= lambda.doubleValue(it)) {
+ if(stopdist <= lambda.doubleValue(it)) {
split--;
- } else {
+ }
+ else {
break;
}
}
- } else if (threshold != null) {
+ }
+ else if(threshold != null) {
split = ids.size();
it.seek(split - 1);
double stopdist = ((DoubleDistance) threshold).doubleValue();
- while (stopdist <= lambda.doubleValue(it) && it.valid()) {
+ while(stopdist <= lambda.doubleValue(it) && it.valid()) {
split--;
it.retract();
}
- } else { // full hierarchy
+ }
+ else { // full hierarchy
split = 0;
}
@@ -463,19 +478,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
DBIDVar succ = DBIDUtil.newVar(); // Variable for successor.
// Go backwards on the lower part.
- for (it.seek(split - 1); it.valid(); it.retract()) {
+ for(it.seek(split - 1); it.valid(); it.retract()) {
double dist = lambda.doubleValue(it); // Distance to successor
pi.assignVar(it, succ); // succ = pi(it)
int clusterid = cluster_map.intValue(succ);
// Successor cluster has already been created:
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
cluster_dbids.get(clusterid).add(it);
cluster_map.putInt(it, clusterid);
// Update distance to maximum encountered:
- if (cluster_dist.get(clusterid) < dist) {
+ if(cluster_dist.get(clusterid) < dist) {
cluster_dist.set(clusterid, dist);
}
- } else {
+ }
+ else {
// Need to start a new cluster:
clusterid = cluster_dbids.size(); // next cluster number.
ModifiableDBIDs cids = DBIDUtil.newArray();
@@ -491,12 +507,12 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
// Decrement counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
final Clustering<DendrogramModel<D>> dendrogram;
- switch(outputmode) {
+ switch(outputmode){
case PARTIAL_HIERARCHY: {
// Build a hierarchy out of these clusters.
dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering");
@@ -505,7 +521,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
@SuppressWarnings("unchecked")
D depth = (D) new DoubleDistance(cluster_dist.get(i));
clusters.add(makeCluster(it2, depth, cluster_dbids.get(i)));
@@ -514,68 +530,75 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
// The current cluster led by the current element:
final Cluster<DendrogramModel<D>> clus;
- if (clusterid >= 0) {
+ if(clusterid >= 0) {
clus = clusters.get(clusterid);
- } else if (!singletons && ids.size() != 1) {
+ }
+ else if(!singletons && ids.size() != 1) {
clus = null;
- } else {
+ }
+ else {
clus = makeCluster(it, null, DBIDUtil.deref(it));
}
// The successor to join:
pi.assignVar(it, succ); // succ = pi(it)
- if (DBIDUtil.equal(it, succ)) {
+ if(DBIDUtil.equal(it, succ)) {
assert (root == null);
root = clus;
- } else {
+ }
+ else {
// Parent cluster:
int parentid = cluster_map.intValue(succ);
@SuppressWarnings("unchecked")
D depth = (D) new DoubleDistance(lambda.doubleValue(it));
// Parent cluster exists - merge as a new cluster:
- if (parentid >= 0) {
+ if(parentid >= 0) {
final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid);
- if (pclus.getModel().getDistance().equals(depth)) {
- if (clus == null) {
+ if(pclus.getModel().getDistance().equals(depth)) {
+ if(clus == null) {
((ModifiableDBIDs) pclus.getIDs()).add(it);
- } else {
+ }
+ else {
dendrogram.addChildCluster(pclus, clus);
}
- } else {
+ }
+ else {
// Merge at new depth:
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids);
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(npclus, clus);
}
dendrogram.addChildCluster(npclus, pclus);
// Replace existing parent cluster: new depth
clusters.set(parentid, npclus);
}
- } else {
+ }
+ else {
// Merge with parent at this depth:
final Cluster<DendrogramModel<D>> pclus;
- if (!singletons) {
+ if(!singletons) {
ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1);
cids.add(succ);
- if (clus == null) {
+ if(clus == null) {
cids.add(it);
}
// New cluster for parent and/or new point
pclus = makeCluster(succ, depth, cids);
- } else {
+ }
+ else {
// Create a new, one-element cluster for parent, and a merged
// cluster on top.
pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS);
dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ)));
}
- if (clus != null) {
+ if(clus != null) {
dendrogram.addChildCluster(pclus, clus);
}
// Store cluster:
@@ -586,7 +609,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
}
// Decrement counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -601,7 +624,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
// Convert initial clusters to cluster objects
{
int i = 0;
- for (DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
+ for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) {
@SuppressWarnings("unchecked")
D depth = (D) new DoubleDistance(cluster_dist.get(i));
dendrogram.addToplevelCluster(makeCluster(it2, depth, cluster_dbids.get(i)));
@@ -610,14 +633,14 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
cluster_dbids = null; // Invalidate
}
// Process the upper part, bottom-up.
- for (it.seek(split); it.valid(); it.advance()) {
+ for(it.seek(split); it.valid(); it.advance()) {
int clusterid = cluster_map.intValue(it);
- if (clusterid < 0) {
+ if(clusterid < 0) {
dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it)));
}
// Decrement counter
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
@@ -627,7 +650,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
throw new AbortException("Unsupported output mode.");
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
@@ -644,13 +667,16 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
*/
private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members) {
final String name;
- if (members.size() == 0) {
+ if(members.size() == 0) {
name = "mrg_" + DBIDUtil.toString(lead) + "_" + depth;
- } else if (depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) {
+ }
+ else if(depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) {
name = "obj_" + DBIDUtil.toString(lead);
- } else if (depth != null) {
+ }
+ else if(depth != null) {
name = "clu_" + DBIDUtil.toString(lead) + "_" + depth;
- } else {
+ }
+ else {
// Complete data set only?
name = "clu_" + DBIDUtil.toString(lead);
}
@@ -794,53 +820,54 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<HierarchicalClusteringAlgorithm<D>> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class);
- if (config.grab(algorithmP)) {
+ if(config.grab(algorithmP)) {
algorithm = algorithmP.instantiateClass(config);
}
EnumParameter<ThresholdMode> modeP = new EnumParameter<>(MODE_ID, ThresholdMode.class, ThresholdMode.BY_MINCLUSTERS);
- if (config.grab(modeP)) {
+ if(config.grab(modeP)) {
thresholdmode = modeP.getValue();
}
- if (thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) {
+ if(thresholdmode == null || ThresholdMode.BY_MINCLUSTERS.equals(thresholdmode)) {
IntParameter minclustersP = new IntParameter(MINCLUSTERS_ID);
- minclustersP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(minclustersP)) {
+ minclustersP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(minclustersP)) {
minclusters = minclustersP.intValue();
}
}
- if (thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) {
+ if(thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) {
// Fallback to double when no algorithm chosen yet:
@SuppressWarnings("unchecked")
final D factory = algorithm != null ? algorithm.getDistanceFactory() : (D) DoubleDistance.FACTORY;
DistanceParameter<D> distP = new DistanceParameter<>(THRESHOLD_ID, factory);
- if (config.grab(distP)) {
+ if(config.grab(distP)) {
threshold = distP.getValue();
}
}
- if (thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) {
+ if(thresholdmode == null || !ThresholdMode.NO_THRESHOLD.equals(thresholdmode)) {
EnumParameter<OutputMode> outputP = new EnumParameter<>(OUTPUTMODE_ID, OutputMode.class);
- if (config.grab(outputP)) {
+ if(config.grab(outputP)) {
outputmode = outputP.getValue();
}
- } else {
+ }
+ else {
// This becomes full hierarchy:
minclusters = -1;
outputmode = OutputMode.PARTIAL_HIERARCHY;
}
Flag singletonsF = new Flag(SINGLETONS_ID);
- if (config.grab(singletonsF)) {
+ if(config.grab(singletonsF)) {
singletons = singletonsF.isTrue();
}
}
@Override
protected ExtractFlatClusteringFromHierarchy<D> makeInstance() {
- switch(thresholdmode) {
+ switch(thresholdmode){
case NO_THRESHOLD:
case BY_MINCLUSTERS:
return new ExtractFlatClusteringFromHierarchy<>(algorithm, minclusters, outputmode, singletons);
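In the extraction above, split is the index in the merge order where the dendrogram is cut: with minclusters requested, it starts at size - minclusters and is then moved left over ties, so that merges at exactly the stop distance are not separated arbitrarily. A self-contained sketch with hypothetical parent distances, using a plain array in place of the lambda data store:

    public class SplitDemo {
      // Assumes minclusters >= 1, as enforced by GREATER_EQUAL_ONE_INT in the patch.
      static int split(double[] lambda, int minclusters) {
        int split = Math.max(lambda.length - minclusters, 0);
        final double stopdist = lambda[split]; // Distance at the cut position.
        // Tie handling: also cut predecessors that merge at >= stopdist.
        while(split > 0 && stopdist <= lambda[split - 1]) {
          split--;
        }
        return split;
      }

      public static void main(String[] args) {
        double[] lambda = { 0.1, 0.2, 0.5, 0.5, 0.9 }; // Parent distances in merge order.
        System.out.println(split(lambda, 2)); // 2, not 3: the two 0.5-merges stay together.
      }
    }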
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
index dc1fa47c..5754e961 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/AbstractKMeans.java
@@ -35,6 +35,7 @@ import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
@@ -49,8 +50,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -105,68 +105,61 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param relation the database to cluster
* @param means a list of k means
* @param clusters the list of current clusters
+ * @param assignment Current cluster assignment
* @return true when any object was reassigned
*/
- protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters) {
+ protected boolean assignToNearestCluster(Relation<V> relation, List<? extends NumberVector<?>> means, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
boolean changed = false;
- if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
- if (dist < mindist) {
+ if(dist < mindist) {
minIndex = i;
mindist = dist;
}
}
- if (clusters.get(minIndex).add(iditer)) {
- changed = true;
- // Remove from previous cluster
- // TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
- break;
- }
- }
- }
- }
+ changed |= updateAssignment(iditer, clusters, assignment, minIndex);
}
- } else {
+ }
+ else {
final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
- if (dist.compareTo(mindist) < 0) {
+ if(dist.compareTo(mindist) < 0) {
minIndex = i;
mindist = dist;
}
}
- if (clusters.get(minIndex).add(iditer)) {
- changed = true;
- // Remove from previous cluster
- // TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
- break;
- }
- }
- }
- }
+ changed |= updateAssignment(iditer, clusters, assignment, minIndex);
}
}
return changed;
}
+ protected boolean updateAssignment(DBIDIter iditer, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, int newA) {
+ final int oldA = assignment.intValue(iditer);
+ if(oldA == newA) {
+ return false;
+ }
+ clusters.get(newA).add(iditer);
+ assignment.putInt(iditer, newA);
+ if(oldA >= 0) {
+ clusters.get(oldA).remove(iditer);
+ }
+ return true;
+ }
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(new CombinedTypeInformation(TypeUtil.NUMBER_VECTOR_FIELD, getDistanceFunction().getInputTypeRestriction()));
@@ -181,24 +174,28 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @return the mean vectors of the given clusters in the given database
*/
protected List<Vector> means(List<? extends ModifiableDBIDs> clusters, List<? extends NumberVector<?>> means, Relation<V> database) {
+ // TODO: use Kahan summation for better numerical precision?
List<Vector> newMeans = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
ModifiableDBIDs list = clusters.get(i);
Vector mean = null;
- if (list.size() > 0) {
- double s = 1.0 / list.size();
+ if(list.size() > 0) {
DBIDIter iter = list.iter();
- assert (iter.valid());
- mean = database.get(iter).getColumnVector().timesEquals(s);
+ // Initialize with first.
+ mean = database.get(iter).getColumnVector();
double[] raw = mean.getArrayRef();
iter.advance();
- for (; iter.valid(); iter.advance()) {
+ // Update with remaining instances
+ for(; iter.valid(); iter.advance()) {
NumberVector<?> vec = database.get(iter);
- for (int j = 0; j < mean.getDimensionality(); j++) {
- raw[j] += s * vec.doubleValue(j);
+ for(int j = 0; j < mean.getDimensionality(); j++) {
+ raw[j] += vec.doubleValue(j);
}
}
- } else {
+ mean.timesEquals(1.0 / list.size());
+ }
+ else {
+ // Keep degenerate means as-is for now.
mean = means.get(i).getColumnVector();
}
newMeans.add(mean);
@@ -218,17 +215,18 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
final int dim = medians.get(0).getDimensionality();
final SortDBIDsBySingleDimension sorter = new SortDBIDsBySingleDimension(database);
List<NumberVector<?>> newMedians = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
ArrayModifiableDBIDs list = DBIDUtil.newArray(clusters.get(i));
- if (list.size() > 0) {
+ if(list.size() > 0) {
Vector mean = new Vector(dim);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
sorter.setDimension(d);
DBID id = QuickSelect.median(list, sorter);
mean.set(d, database.get(id).doubleValue(d));
}
newMedians.add(mean);
- } else {
+ }
+ else {
newMedians.add((NumberVector<?>) medians.get(i));
}
}
@@ -244,14 +242,11 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param op Cluster size change / Weight change
*/
protected void incrementalUpdateMean(Vector mean, V vec, int newsize, double op) {
- if (newsize == 0) {
+ if(newsize == 0) {
return; // Keep old mean
}
- Vector delta = vec.getColumnVector();
- // Compute difference from mean
- delta.minusEquals(mean);
- delta.timesEquals(op / newsize);
- mean.plusEquals(delta);
+ Vector delta = vec.getColumnVector().minusEquals(mean);
+ mean.plusTimesEquals(delta, op / newsize);
}
/**
@@ -260,76 +255,84 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
* @param relation Relation
* @param means Means
* @param clusters Clusters
+ * @param assignment Current cluster assignment
* @return true when the means have changed
*/
- protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters) {
+ protected boolean macQueenIterate(Relation<V> relation, List<Vector> means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
boolean changed = false;
- if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
// Raw distance function
@SuppressWarnings("unchecked")
final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
// Incremental update
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double mindist = Double.POSITIVE_INFINITY;
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
double dist = df.doubleDistance(fv, means.get(i));
- if (dist < mindist) {
+ if(dist < mindist) {
minIndex = i;
mindist = dist;
}
}
- // Update the cluster mean incrementally:
- for (int i = 0; i < k; i++) {
- ModifiableDBIDs ci = clusters.get(i);
- if (i == minIndex) {
- if (ci.add(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
- changed = true;
- }
- } else if (ci.remove(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
- changed = true;
- }
- }
+ changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
}
- } else {
+ }
+ else {
// Raw distance function
final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
// Incremental update
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
D mindist = df.getDistanceFactory().infiniteDistance();
V fv = relation.get(iditer);
int minIndex = 0;
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
D dist = df.distance(fv, means.get(i));
- if (dist.compareTo(mindist) < 0) {
+ if(dist.compareTo(mindist) < 0) {
minIndex = i;
mindist = dist;
}
}
- // Update the cluster mean incrementally:
- for (int i = 0; i < k; i++) {
- ModifiableDBIDs ci = clusters.get(i);
- if (i == minIndex) {
- if (ci.add(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size(), +1);
- changed = true;
- }
- } else if (ci.remove(iditer)) {
- incrementalUpdateMean(means.get(i), fv, ci.size() + 1, -1);
- changed = true;
- }
- }
+ changed |= updateMeanAndAssignment(clusters, means, minIndex, fv, iditer, assignment);
}
}
return changed;
}
+ /**
+ * Try to update the cluster assignment.
+ *
+ * @param clusters Current clusters
+ * @param means Means to update
+ * @param minIndex Cluster to assign to
+ * @param fv Vector
+ * @param iditer Object ID
+ * @param assignment Current cluster assignment
+ * @return {@code true} when assignment changed
+ */
+ private boolean updateMeanAndAssignment(List<ModifiableDBIDs> clusters, List<Vector> means, int minIndex, V fv, DBIDIter iditer, WritableIntegerDataStore assignment) {
+ int cur = assignment.intValue(iditer);
+ if(cur == minIndex) {
+ return false;
+ }
+ final ModifiableDBIDs curclus = clusters.get(minIndex);
+ curclus.add(iditer);
+ incrementalUpdateMean(means.get(minIndex), fv, curclus.size(), +1);
+
+ if(cur >= 0) {
+ ModifiableDBIDs ci = clusters.get(cur);
+ ci.remove(iditer);
+ incrementalUpdateMean(means.get(cur), fv, ci.size() + 1, -1);
+ }
+
+ assignment.putInt(iditer, minIndex);
+ return true;
+ }
+
@Override
public void setK(int k) {
this.k = k;
@@ -366,27 +369,27 @@ public abstract class AbstractKMeans<V extends NumberVector<?>, D extends Distan
@Override
protected void makeOptions(Parameterization config) {
ObjectParameter<PrimitiveDistanceFunction<NumberVector<?>, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, PrimitiveDistanceFunction.class);
- if (config.grab(distanceFunctionP)) {
+ if(config.grab(distanceFunctionP)) {
distanceFunction = distanceFunctionP.instantiateClass(config);
- if (!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
+ if(!(distanceFunction instanceof EuclideanDistanceFunction) && !(distanceFunction instanceof SquaredEuclideanDistanceFunction)) {
getLogger().warning("k-means optimizes the sum of squares - it should be used with squared euclidean distance and may stop converging otherwise!");
}
}
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
ObjectParameter<KMeansInitialization<V>> initialP = new ObjectParameter<>(INIT_ID, KMeansInitialization.class, RandomlyChosenInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
IntParameter maxiterP = new IntParameter(MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.getValue();
}
}
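The key change in AbstractKMeans above is the WritableIntegerDataStore holding each object's current cluster index, which resolves the old TODO: reassignment no longer scans all k clusters to find and remove the previous membership. A self-contained sketch of the same bookkeeping, with plain collections standing in for the ELKI data stores:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    public class AssignDemo {
      static <T> boolean updateAssignment(T id, List<Set<T>> clusters, Map<T, Integer> assignment, int newA) {
        final int oldA = assignment.getOrDefault(id, -1);
        if(oldA == newA) {
          return false; // Nothing to do.
        }
        clusters.get(newA).add(id);
        assignment.put(id, newA);
        if(oldA >= 0) {
          clusters.get(oldA).remove(id); // Direct removal; no scan over all k clusters.
        }
        return true;
      }

      public static void main(String[] args) {
        List<Set<String>> clusters = new ArrayList<>();
        clusters.add(new HashSet<>());
        clusters.add(new HashSet<>());
        Map<String, Integer> assignment = new HashMap<>();
        System.out.println(updateAssignment("a", clusters, assignment, 0)); // true
        System.out.println(updateAssignment("a", clusters, assignment, 0)); // false: unchanged
        System.out.println(updateAssignment("a", clusters, assignment, 1)); // true: moved from 0 to 1
      }
    }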
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
index 30bb640c..51e7ace9 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/BestOfMultipleKMeans.java
@@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -90,34 +90,35 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
- if (!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) {
+ if(!(innerkMeans.getDistanceFunction() instanceof PrimitiveDistanceFunction)) {
throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + innerkMeans.getDistanceFunction().getClass());
}
final PrimitiveDistanceFunction<? super V, D> df = (PrimitiveDistanceFunction<? super V, D>) innerkMeans.getDistanceFunction();
Clustering<M> bestResult = null;
- if (trials > 1) {
+ if(trials > 1) {
double bestCost = Double.POSITIVE_INFINITY;
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
- for (int i = 0; i < trials; i++) {
+ for(int i = 0; i < trials; i++) {
Clustering<M> currentCandidate = innerkMeans.run(database, relation);
double currentCost = qualityMeasure.calculateCost(currentCandidate, df, relation);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Cost of candidate " + i + ": " + currentCost);
}
- if (currentCost < bestCost) {
+ if(currentCost < bestCost) {
bestResult = currentCandidate;
bestCost = currentCost;
}
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
- } else {
+ }
+ else {
bestResult = innerkMeans.run(database);
}
@@ -195,18 +196,18 @@ public class BestOfMultipleKMeans<V extends NumberVector<?>, D extends Distance<
@Override
protected void makeOptions(Parameterization config) {
IntParameter trialsP = new IntParameter(TRIALS_ID);
- trialsP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(trialsP)) {
+ trialsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(trialsP)) {
trials = trialsP.intValue();
}
ObjectParameter<KMeans<V, D, M>> kMeansVariantP = new ObjectParameter<>(KMEANS_ID, KMeans.class);
- if (config.grab(kMeansVariantP)) {
+ if(config.grab(kMeansVariantP)) {
kMeansVariant = kMeansVariantP.instantiateClass(config);
}
ObjectParameter<KMeansQualityMeasure<V, ? super D>> qualityMeasureP = new ObjectParameter<>(QUALITYMEASURE_ID, KMeansQualityMeasure.class);
- if (config.grab(qualityMeasureP)) {
+ if(config.grab(qualityMeasureP)) {
qualityMeasure = qualityMeasureP.instantiateClass(config);
}
}
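The run() method above is a generic best-of-trials wrapper: execute the inner k-means several times and keep the clustering with the lowest cost under the configured quality measure. A self-contained sketch of that control flow, with a hypothetical cost function in place of KMeansQualityMeasure:

    import java.util.Random;
    import java.util.function.Supplier;
    import java.util.function.ToDoubleFunction;

    public class BestOfDemo {
      static <R> R bestOf(int trials, Supplier<R> run, ToDoubleFunction<R> cost) {
        R best = null;
        double bestCost = Double.POSITIVE_INFINITY;
        for(int i = 0; i < trials; i++) {
          R candidate = run.get();
          double c = cost.applyAsDouble(candidate);
          if(c < bestCost) { // Keep only the best-scoring result.
            best = candidate;
            bestCost = c;
          }
        }
        return best;
      }

      public static void main(String[] args) {
        Random rnd = new Random(0);
        Double smallest = bestOf(5, rnd::nextDouble, d -> d);
        System.out.println(smallest); // The smallest of five random draws.
      }
    }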
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
index a018c04b..9edfd816 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/FarthestPointsInitialMeans.java
@@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
*/
import java.util.ArrayList;
import java.util.List;
-import java.util.Random;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.Database;
@@ -74,7 +73,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
@Override
public List<V> chooseInitialMeans(Database database, Relation<V> relation, int k, PrimitiveDistanceFunction<? super NumberVector<?>, ?> distanceFunction) {
// Get a distance query
- if (!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
+ if(!(distanceFunction.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
@@ -84,26 +83,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Chose first mean
List<V> means = new ArrayList<>(k);
- Random random = rnd.getRandom();
- DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter();
+ DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter();
means.add(relation.get(first));
DBIDVar best = DBIDUtil.newVar(first);
- for (int i = (dropfirst ? 0 : 1); i < k; i++) {
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
// Find farthest object:
double maxdist = Double.NEGATIVE_INFINITY;
- for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
double dsum = 0.;
- for (V ex : means) {
+ for(V ex : means) {
dsum += distQ.distance(ex, it).doubleValue();
}
- if (dsum > maxdist) {
+ if(dsum > maxdist) {
maxdist = dsum;
best.set(it);
}
}
// Add new mean:
- if (k == 0) {
+ if(k == 0) {
means.clear(); // Remove temporary first element.
}
means.add(relation.get(best));
@@ -114,7 +112,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
@Override
public DBIDs chooseInitialMedoids(int k, DistanceQuery<? super V, ?> distQ2) {
- if (!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
+ if(!(distQ2.getDistanceFactory() instanceof NumberDistance)) {
throw new AbortException("Farthest points K-Means initialization can only be used with numerical distances.");
}
@SuppressWarnings("unchecked")
@@ -123,26 +121,25 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Chose first mean
ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
- Random random = rnd.getRandom();
- DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter();
+ DBIDIter first = DBIDUtil.randomSample(relation.getDBIDs(), 1, rnd).iter();
means.add(first);
DBIDVar best = DBIDUtil.newVar(first);
- for (int i = (dropfirst ? 0 : 1); i < k; i++) {
+ for(int i = (dropfirst ? 0 : 1); i < k; i++) {
// Find farthest object:
double maxdist = Double.NEGATIVE_INFINITY;
- for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
double dsum = 0.;
- for (DBIDIter ex = means.iter(); ex.valid(); ex.advance()) {
+ for(DBIDIter ex = means.iter(); ex.valid(); ex.advance()) {
dsum += distQ.distance(ex, it).doubleValue();
}
- if (dsum > maxdist) {
+ if(dsum > maxdist) {
maxdist = dsum;
best.set(it);
}
}
// Add new mean:
- if (k == 0) {
+ if(k == 0) {
means.clear(); // Remove temporary first element.
}
means.add(best);
@@ -173,7 +170,7 @@ public class FarthestPointsInitialMeans<V, D extends NumberDistance<D, ?>> exten
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
Flag dropfirstP = new Flag(DROPFIRST_ID);
- if (config.grab(dropfirstP)) {
+ if(config.grab(dropfirstP)) {
dropfirst = dropfirstP.isTrue();
}
}
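chooseInitialMeans() above implements farthest-points seeding: after one random pick, each further mean is the object with the largest summed distance to all means chosen so far. A self-contained, one-dimensional sketch with hypothetical data; the dropfirst option is omitted and the random first pick is simplified to a fixed index:

    import java.util.ArrayList;
    import java.util.List;

    public class FarthestDemo {
      static List<Double> choose(double[] data, int k, int firstIndex) {
        List<Double> means = new ArrayList<>();
        means.add(data[firstIndex]); // Stand-in for the random first pick.
        for(int i = 1; i < k; i++) {
          double best = Double.NaN, maxdist = Double.NEGATIVE_INFINITY;
          for(double v : data) {
            double dsum = 0.;
            for(double m : means) {
              dsum += Math.abs(v - m); // 1-d distance to each chosen mean.
            }
            if(dsum > maxdist) {
              maxdist = dsum;
              best = v;
            }
          }
          means.add(best);
        }
        return means;
      }

      public static void main(String[] args) {
        double[] data = { 0., 1., 5., 9., 10. };
        System.out.println(choose(data, 3, 1)); // [1.0, 10.0, 0.0]
      }
    }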
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
new file mode 100644
index 00000000..aec4fe0f
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBatchedLloyd.java
@@ -0,0 +1,346 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * Provides the k-means algorithm, using Lloyd-style bulk iterations.
+ *
+ * However, in contrast to Lloyd's k-means and similar to MacQueen, we do update
+ * the mean vectors multiple times, not only at the very end of the iteration.
+ * This should yield faster convergence at little extra cost.
+ *
+ * To avoid issues with ordered data, we use random sampling to obtain the data
+ * blocks.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+public class KMeansBatchedLloyd<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansBatchedLloyd.class);
+
+ /**
+ * Number of blocks to use.
+ */
+ int blocks;
+
+ /**
+ * Random used for partitioning.
+ */
+ RandomFactory random;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ * @param blocks Number of blocks
+ * @param random Random factory used for partitioning.
+ */
+ public KMeansBatchedLloyd(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer, int blocks, RandomFactory random) {
+ super(distanceFunction, k, maxiter, initializer);
+ this.blocks = blocks;
+ this.random = random;
+ }
+
+ @Override
+ public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+ // Choose initial means
+ List<? extends NumberVector<?>> mvs = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
+ // Convert to (modifiable) math vectors.
+ List<Vector> means = new ArrayList<>(k);
+ for (NumberVector<?> m : mvs) {
+ means.add(m.getColumnVector());
+ }
+
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for (int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+
+ ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), blocks, random);
+
+ double[][] meanshift = new double[k][dim];
+ int[] changesize = new int[k];
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ boolean changed = false;
+ FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null;
+ for (int p = 0; p < parts.length; p++) {
+ // Initialize new means scratch space.
+ for (int i = 0; i < k; i++) {
+ Arrays.fill(meanshift[i], 0.);
+ }
+ Arrays.fill(changesize, 0);
+ changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment);
+ // Recompute means.
+ updateMeans(means, meanshift, clusters, changesize);
+ if (pprog != null) {
+ pprog.incrementProcessed(LOG);
+ }
+ }
+ if (pprog != null) {
+ pprog.ensureCompleted(LOG);
+ }
+ // Stop if no cluster assignment changed.
+ if (!changed) {
+ break;
+ }
+ }
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
+
+ // Wrap result
+ final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
+ Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for (int i = 0; i < clusters.size(); i++) {
+ KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Assign each object in the given ID set to its nearest mean, accumulating
+ * the mean shift and cluster size changes needed for the batched update.
+ *
+ * @param relation the database to cluster
+ * @param ids IDs to process
+ * @param oldmeans a list of k means
+ * @param meanshift delta to apply to each mean
+ * @param changesize Per-cluster size changes
+ * @param clusters cluster assignment
+ * @param assignment Current cluster assignment
+ * @return true when any object was reassigned
+ */
+ protected boolean assignToNearestCluster(Relation<V> relation, DBIDs ids, List<? extends NumberVector<?>> oldmeans, double[][] meanshift, int[] changesize, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment) {
+ boolean changed = false;
+
+ if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction) {
+ @SuppressWarnings("unchecked")
+ final PrimitiveDoubleDistanceFunction<? super NumberVector<?>> df = (PrimitiveDoubleDistanceFunction<? super NumberVector<?>>) getDistanceFunction();
+ for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY;
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for (int i = 0; i < k; i++) {
+ double dist = df.doubleDistance(fv, oldmeans.get(i));
+ if (dist < mindist) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
+ }
+ } else {
+ final PrimitiveDistanceFunction<? super NumberVector<?>, D> df = getDistanceFunction();
+ for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ D mindist = df.getDistanceFactory().infiniteDistance();
+ V fv = relation.get(iditer);
+ int minIndex = 0;
+ for (int i = 0; i < k; i++) {
+ D dist = df.distance(fv, oldmeans.get(i));
+ if (dist.compareTo(mindist) < 0) {
+ minIndex = i;
+ mindist = dist;
+ }
+ }
+ changed |= updateAssignment(iditer, fv, clusters, assignment, meanshift, changesize, minIndex);
+ }
+ }
+ return changed;
+ }
+
+ /**
+ * Update the assignment of a single object.
+ *
+ * @param id Object to assign
+ * @param fv Vector
+ * @param clusters Clusters
+ * @param assignment Current cluster assignment
+ * @param meanshift Current shifting offset
+ * @param changesize Per-cluster size change counters
+ * @param minIndex Index of best cluster.
+ * @return {@code true} when assignment changed.
+ */
+ protected boolean updateAssignment(DBIDIter id, V fv, List<? extends ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[][] meanshift, int[] changesize, int minIndex) {
+ int cur = assignment.intValue(id);
+ if (cur == minIndex) {
+ return false;
+ }
+ // Add to new cluster.
+ {
+ clusters.get(minIndex).add(id);
+ changesize[minIndex]++;
+ double[] raw = meanshift[minIndex];
+ for (int j = 0; j < fv.getDimensionality(); j++) {
+ raw[j] += fv.doubleValue(j);
+ }
+ }
+ // Remove from previous cluster
+ if (cur >= 0) {
+ clusters.get(cur).remove(id);
+ changesize[cur]--;
+ double[] raw = meanshift[cur];
+ for (int j = 0; j < fv.getDimensionality(); j++) {
+ raw[j] -= fv.doubleValue(j);
+ }
+ }
+ assignment.putInt(id, minIndex);
+ return true;
+ }
+
+ /**
+ * Merge changes into mean vectors.
+ *
+ * @param means Mean vectors
+ * @param meanshift Shift offset
+ * @param clusters
+ * @param changesize Size of change (for weighting!)
+ */
+ protected void updateMeans(List<Vector> means, double[][] meanshift, List<ModifiableDBIDs> clusters, int[] changesize) {
+ for (int i = 0; i < k; i++) {
+ int newsize = clusters.get(i).size(), oldsize = newsize - changesize[i];
+ if (newsize == 0) {
+ continue; // Keep previous mean vector.
+ }
+ if (oldsize == 0) {
+ means.set(i, new Vector(meanshift[i]).times(1. / newsize));
+ continue; // Replace with new vector.
+ }
+ if (oldsize == newsize) {
+ means.get(i).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize);
+ continue;
+ }
+ means.get(i).timesEquals(oldsize / (double) newsize).plusTimesEquals(new Vector(meanshift[i]), 1. / (double) newsize);
+ }
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ /**
+ * Parameter for the number of blocks.
+ */
+ public static final OptionID BLOCKS_ID = new OptionID("kmeans.blocks", "Number of blocks to use for processing. Means will be recomputed after each block.");
+
+ /**
+ * Random source for blocking.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("kmeans.blocks.random", "Random source for producing blocks.");
+
+ /**
+ * Number of blocks.
+ */
+ int blocks;
+
+ /**
+ * Random used for partitioning.
+ */
+ RandomFactory random;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter blocksP = new IntParameter(BLOCKS_ID, 10);
+ blocksP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if (config.grab(blocksP)) {
+ blocks = blocksP.intValue();
+ }
+ RandomParameter randomP = new RandomParameter(RANDOM_ID);
+ if (config.grab(randomP)) {
+ random = randomP.getValue();
+ }
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected KMeansBatchedLloyd<V, D> makeInstance() {
+ return new KMeansBatchedLloyd<>(distanceFunction, k, maxiter, initializer, blocks, random);
+ }
+ }
+}
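
Aside: the weighted merge in updateMeans() above keeps the running mean exact without revisiting unchanged points. With meanshift holding the coordinate sums of added minus removed members, the general case is newMean = oldMean * oldSize/newSize + shift/newSize; the oldsize == 0 and oldsize == newsize branches are its degenerate forms. A standalone sketch of the arithmetic (hypothetical helper, not part of the patch):

  // Batched mean update: reweight the old mean to the new size,
  // then fold in the aggregated coordinate shift.
  static double[] mergeMean(double[] oldMean, double[] shift, int oldSize, int newSize) {
    double[] mean = new double[oldMean.length];
    for (int j = 0; j < mean.length; j++) {
      mean[j] = oldMean[j] * ((double) oldSize / newSize) + shift[j] / newSize;
    }
    return mean;
  }
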
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
index 37071d36..80a581b1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansBisecting.java
@@ -41,7 +41,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -205,7 +205,7 @@ public class KMeansBisecting<V extends NumberVector<?>, D extends Distance<?>, M
super.makeOptions(config);
IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(new GreaterConstraint(1));
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if (config.grab(kP)) {
k = kP.intValue();
}
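
Aside: this one-line migration recurs throughout the patch set; per-parameter constraint objects such as new GreaterConstraint(1) give way to the shared singletons in CommonConstraints. The pattern in isolation:

  IntParameter kP = new IntParameter(KMeans.K_ID);
  // before: kP.addConstraint(new GreaterConstraint(1)); -- a fresh object per parameter
  kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); // shared singleton, same check
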
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
new file mode 100644
index 00000000..2a60ef27
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansHybridLloydMacQueen.java
@@ -0,0 +1,155 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+
+/**
+ * Provides the k-means algorithm, alternating between MacQueen-style
+ * incremental processing and Lloyd-style batch steps.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.landmark
+ * @apiviz.has KMeansModel
+ *
+ * @param <V> vector datatype
+ * @param <D> distance value type
+ */
+public class KMeansHybridLloydMacQueen<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans<V, D, KMeansModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMeansHybridLloydMacQueen.class);
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Initialization method
+ */
+ public KMeansHybridLloydMacQueen(PrimitiveDistanceFunction<NumberVector<?>, D> distanceFunction, int k, int maxiter, KMeansInitialization<V> initializer) {
+ super(distanceFunction, k, maxiter, initializer);
+ }
+
+ @Override
+ public Clustering<KMeansModel<V>> run(Database database, Relation<V> relation) {
+ if (relation.size() <= 0) {
+ return new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ }
+ // Choose initial means
+ List<Vector> means = new ArrayList<>(k);
+ for (NumberVector<?> nv : initializer.chooseInitialMeans(database, relation, k, getDistanceFunction())) {
+ means.add(nv.getColumnVector());
+ }
+ // Setup cluster assignment store
+ List<ModifiableDBIDs> clusters = new ArrayList<>();
+ for (int i = 0; i < k; i++) {
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
+ }
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
+ for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration += 2) {
+ { // MacQueen
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ boolean changed = macQueenIterate(relation, means, clusters, assignment);
+ if (!changed) {
+ break;
+ }
+ }
+ { // Lloyd
+ if (prog != null) {
+ prog.incrementProcessed(LOG);
+ }
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment);
+ // Stop if no cluster assignment changed.
+ if (!changed) {
+ break;
+ }
+ // Recompute means.
+ means = means(clusters, means, relation);
+ }
+ }
+ if (prog != null) {
+ prog.setCompleted(LOG);
+ }
+
+ // Wrap result
+ final NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
+ Clustering<KMeansModel<V>> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
+ for (int i = 0; i < clusters.size(); i++) {
+ KMeansModel<V> model = new KMeansModel<>(factory.newNumberVector(means.get(i).getColumnVector().getArrayRef()));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>, D extends Distance<D>> extends AbstractKMeans.Parameterizer<V, D> {
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ @Override
+ protected KMeansHybridLloydMacQueen<V, D> makeInstance() {
+ return new KMeansHybridLloydMacQueen<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
index e692293c..686e2076 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansLloyd.java
@@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -93,15 +96,16 @@ public class KMeansLloyd<V extends NumberVector<?>, D extends Distance<D>> exten
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
if (prog != null) {
prog.incrementProcessed(LOG);
}
- boolean changed = assignToNearestCluster(relation, means, clusters);
+ boolean changed = assignToNearestCluster(relation, means, clusters, assignment);
// Stop if no cluster assignment changed.
if (!changed) {
break;
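
Aside: the new assignment store remembers each object's current cluster index, so a reassignment removes the object from exactly one old cluster instead of probing all k sets (the TODO still visible in the k-medoids classes below). A sketch of the bookkeeping with plain Java collections (hypothetical, for illustration only):

  import java.util.List;
  import java.util.Map;
  import java.util.Set;

  // O(1) reassignment: look up the old cluster instead of scanning all k sets.
  static boolean reassign(Map<Integer, Integer> assignment, List<Set<Integer>> clusters, int id, int best) {
    int cur = assignment.getOrDefault(id, -1); // -1 = not yet assigned, as in the patch
    if (cur == best) {
      return false; // nothing changed
    }
    if (cur >= 0) {
      clusters.get(cur).remove(id); // exactly one removal
    }
    clusters.get(best).add(id);
    assignment.put(id, best);
    return true;
  }
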
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
index bb689bd3..a0f4bb3f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansMacQueen.java
@@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.KMeansModel;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
@@ -95,11 +98,9 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
// Initialize cluster and assign objects
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
- assignToNearestCluster(relation, means, clusters);
- // Initial recomputation of the means.
- means = means(clusters, means, relation);
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
// Refine result
@@ -107,7 +108,7 @@ public class KMeansMacQueen<V extends NumberVector<?>, D extends Distance<D>> ex
if (prog != null) {
prog.incrementProcessed(LOG);
}
- boolean changed = macQueenIterate(relation, means, clusters);
+ boolean changed = macQueenIterate(relation, means, clusters, assignment);
if (!changed) {
break;
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
index 302ca86b..6fc514eb 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMeansPlusPlusInitialMeans.java
@@ -84,8 +84,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Choose first mean
List<V> means = new ArrayList<>(k);
- Random random = rnd.getRandom();
- DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, new Random(random.nextLong())).iter());
+ Random random = rnd.getSingleThreadedRandom();
+ DBID first = DBIDUtil.deref(DBIDUtil.randomSample(relation.getDBIDs(), 1, random).iter());
means.add(relation.get(first));
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
@@ -134,8 +134,8 @@ public class KMeansPlusPlusInitialMeans<V, D extends NumberDistance<D, ?>> exten
// Choose first mean
ArrayModifiableDBIDs means = DBIDUtil.newArray(k);
- Random random = rnd.getRandom();
- DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, new Random(random.nextLong())).iter());
+ Random random = rnd.getSingleThreadedRandom();
+ DBID first = DBIDUtil.deref(DBIDUtil.randomSample(distQ.getRelation().getDBIDs(), 1, random).iter());
means.add(first);
ArrayDBIDs ids = DBIDUtil.ensureArray(distQ.getRelation().getDBIDs());
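
Aside: the seeding that continues after this first mean is standard k-means++ D²-sampling, i.e. each further seed is drawn with probability proportional to its squared distance from the nearest seed chosen so far. A self-contained sketch over raw double[] points (illustrative only, not the ELKI API):

  import java.util.ArrayList;
  import java.util.List;
  import java.util.Random;

  // k-means++ seeding: draw each next seed proportional to D^2.
  static List<double[]> kmppSeeds(double[][] pts, int k, Random rnd) {
    List<double[]> seeds = new ArrayList<>();
    seeds.add(pts[rnd.nextInt(pts.length)]); // uniform first seed
    double[] d2 = new double[pts.length];
    while (seeds.size() < k) {
      double sum = 0.;
      for (int i = 0; i < pts.length; i++) {
        double best = Double.POSITIVE_INFINITY;
        for (double[] s : seeds) { // squared distance to the nearest seed
          double d = 0.;
          for (int j = 0; j < pts[i].length; j++) {
            final double delta = pts[i][j] - s[j];
            d += delta * delta;
          }
          best = Math.min(best, d);
        }
        d2[i] = best;
        sum += best;
      }
      double r = rnd.nextDouble() * sum; // roulette-wheel selection on D^2
      int idx = 0;
      while (idx < pts.length - 1 && (r -= d2[idx]) > 0) {
        idx++;
      }
      seeds.add(pts[idx]);
    }
    return seeds;
  }
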
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
index cc7aaa9e..0a97c4d3 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMediansLloyd.java
@@ -31,6 +31,9 @@ import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
@@ -88,15 +91,16 @@ public class KMediansLloyd<V extends NumberVector<?>, D extends Distance<D>> ext
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
- clusters.add(DBIDUtil.newHashSet(relation.size() / k));
+ clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
for (int iteration = 0; maxiter <= 0 || iteration < maxiter; iteration++) {
if (prog != null) {
prog.incrementProcessed(LOG);
}
- boolean changed = assignToNearestCluster(relation, medians, clusters);
+ boolean changed = assignToNearestCluster(relation, medians, clusters, assignment);
// Stop if no cluster assignment changed.
if (!changed) {
break;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
index 87a0c7ae..41cca225 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsEM.java
@@ -48,8 +48,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.Mean;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -119,7 +118,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
* @return result
*/
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
@@ -127,7 +126,7 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
Mean[] mdists = Mean.newArray(k);
@@ -139,47 +138,47 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids iteration", LOG) : null;
// Swap phase
boolean changed = true;
- while (changed) {
- if (prog != null) {
+ while(changed) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
changed = false;
// Try to swap the medoid with a better cluster member:
int i = 0;
- for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
DBID best = null;
Mean bestm = mdists[i];
- for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
- if (DBIDUtil.equal(miter, iter)) {
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(DBIDUtil.equal(miter, iter)) {
continue;
}
Mean mdist = new Mean();
- for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
mdist.put(distQ.distance(iter, iter2).doubleValue());
}
- if (mdist.getMean() < bestm.getMean()) {
+ if(mdist.getMean() < bestm.getMean()) {
best = DBIDUtil.deref(iter);
bestm = mdist;
}
}
- if (best != null && !DBIDUtil.equal(miter, best)) {
+ if(best != null && !DBIDUtil.equal(miter, best)) {
changed = true;
medoids.set(i, best);
mdists[i] = bestm;
}
}
// Reassign
- if (changed) {
+ if(changed) {
assignToNearestCluster(medoids, mdists, clusters, distQ);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.setCompleted(LOG);
}
// Wrap result
Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < clusters.size(); i++) {
MedoidModel model = new MedoidModel(medoids.get(i));
result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
@@ -200,27 +199,27 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
boolean changed = false;
double[] dists = new double[k];
- for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
int minIndex = 0;
double mindist = Double.POSITIVE_INFINITY;
{
int i = 0;
- for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
dists[i] = distQ.distance(iditer, miter).doubleValue();
- if (dists[i] < mindist) {
+ if(dists[i] < mindist) {
minIndex = i;
mindist = dists[i];
}
}
}
- if (clusters.get(minIndex).add(iditer)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
mdist[minIndex].put(mindist);
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
mdist[minIndex].put(dists[i], -1);
break;
}
@@ -259,19 +258,19 @@ public class KMedoidsEM<V, D extends NumberDistance<D, ?>> extends AbstractDista
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.intValue();
}
}
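
Aside: the swap phase above considers, per cluster, replacing the medoid by the member with the smallest mean distance to all cluster members. As a standalone sketch over a precomputed distance matrix (hypothetical helper):

  // Candidate medoid of one cluster: the member minimizing the mean
  // distance to all members (self-distance is zero and included).
  static int bestMedoid(double[][] dist, int[] members) {
    int best = members[0];
    double bestMean = Double.POSITIVE_INFINITY;
    for (int c : members) {
      double sum = 0.;
      for (int o : members) {
        sum += dist[c][o];
      }
      final double mean = sum / members.length;
      if (mean < bestMean) {
        bestMean = mean;
        best = c;
      }
    }
    return best;
  }
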
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
index 1feda867..c9e1dc47 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -53,8 +53,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -124,7 +123,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
* @return result
*/
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
- if (relation.size() <= 0) {
+ if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
@@ -133,7 +132,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
@@ -145,8 +144,8 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
// Swap phase
boolean changed = true;
- while (changed) {
- if (prog != null) {
+ while(changed) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
changed = false;
@@ -155,57 +154,60 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
DBID bestid = null;
int bestcluster = -1;
int i = 0;
- for (DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
- for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
- if (DBIDUtil.equal(miter, iter)) {
+ for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
+ if(DBIDUtil.equal(miter, iter)) {
continue;
}
// double disti = distQ.distance(id, med).doubleValue();
double cost = 0;
DBIDIter olditer = medoids.iter();
- for (int j = 0; j < k; j++, olditer.advance()) {
- for (DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
+ for(int j = 0; j < k; j++, olditer.advance()) {
+ for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
double distcur = distQ.distance(iter2, olditer).doubleValue();
double distnew = distQ.distance(iter2, iter).doubleValue();
- if (j == i) {
+ if(j == i) {
// Cases 1 and 2.
double distsec = second.doubleValue(iter2);
- if (distcur > distsec) {
+ if(distcur > distsec) {
// Case 1, other would switch to a third medoid
cost += distsec - distcur; // Always positive!
- } else { // Would remain with the candidate
+ }
+ else { // Would remain with the candidate
cost += distnew - distcur; // Could be negative
}
- } else {
+ }
+ else {
// Cases 3-4: objects from other clusters
- if (distcur < distnew) {
+ if(distcur < distnew) {
// Case 3: no change
- } else {
+ }
+ else {
// Case 4: would switch to new medoid
cost += distnew - distcur; // Always negative
}
}
}
}
- if (cost < best) {
+ if(cost < best) {
best = cost;
bestid = DBIDUtil.deref(iter);
bestcluster = i;
}
}
}
- if (prog != null) {
+ if(prog != null) {
prog.setCompleted(LOG);
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Best cost: " + best);
}
- if (bestid != null) {
+ if(bestid != null) {
changed = true;
medoids.set(bestcluster, bestid);
}
// Reassign
- if (changed) {
+ if(changed) {
// TODO: can we save some of these recomputations?
assignToNearestCluster(medoids, ids, second, clusters, distQ);
}
@@ -213,7 +215,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
// Wrap result
Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
- for (int i = 0; i < clusters.size(); i++) {
+ for(int i = 0; i < clusters.size(); i++) {
MedoidModel model = new MedoidModel(medoids.get(i));
result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
}
@@ -234,30 +236,31 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
boolean changed = false;
- for (DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
int minIndex = 0;
double mindist = Double.POSITIVE_INFINITY;
double mindist2 = Double.POSITIVE_INFINITY;
{
int i = 0;
- for (DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
+ for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
double dist = distQ.distance(iditer, miter).doubleValue();
- if (dist < mindist) {
+ if(dist < mindist) {
minIndex = i;
mindist2 = mindist;
mindist = dist;
- } else if (dist < mindist2) {
+ }
+ else if(dist < mindist2) {
mindist2 = dist;
}
}
}
- if (clusters.get(minIndex).add(iditer)) {
+ if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for (int i = 0; i < k; i++) {
- if (i != minIndex) {
- if (clusters.get(i).remove(iditer)) {
+ for(int i = 0; i < k; i++) {
+ if(i != minIndex) {
+ if(clusters.get(i).remove(iditer)) {
break;
}
}
@@ -296,19 +299,19 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
- if (config.grab(initialP)) {
+ if(config.grab(initialP)) {
initializer = initialP.instantiateClass(config);
}
IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0);
- maxiterP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(maxiterP)) {
+ maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
maxiter = maxiterP.intValue();
}
}
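
Aside: the four cases in the swap loop above correspond to the textbook per-object contribution for swapping medoid i against candidate h: objects of cluster i fall back to the cheaper of their second-nearest medoid and h, while objects of other clusters move only if h comes closer. A condensed sketch of that formulation (not a literal transcription of the loop above):

  // Per-object PAM swap cost, given:
  // distCur - distance to the object's current medoid,
  // distSec - distance to its second-nearest medoid,
  // distNew - distance to the swap candidate,
  // ownCluster - whether the object belongs to the medoid being replaced.
  static double swapCost(double distCur, double distSec, double distNew, boolean ownCluster) {
    if (ownCluster) {
      // Cases 1+2: the old medoid disappears; move to the cheaper alternative.
      return Math.min(distNew, distSec) - distCur;
    }
    // Cases 3+4: switch to the candidate only if it is closer.
    return Math.min(distNew - distCur, 0.);
  }
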
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
index ee90e0dc..1329132e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/RandomlyGeneratedInitialMeans.java
@@ -60,7 +60,7 @@ public class RandomlyGeneratedInitialMeans<V extends NumberVector<?>> extends Ab
NumberVector.Factory<V, ?> factory = RelationUtil.getNumberVectorFactory(relation);
Pair<V, V> minmax = DatabaseUtil.computeMinMax(relation);
List<V> means = new ArrayList<>(k);
- final Random random = rnd.getRandom();
+ final Random random = rnd.getSingleThreadedRandom();
for(int i = 0; i < k; i++) {
double[] r = MathUtil.randomDoubleArray(dim, random);
// Rescale
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
index 9f0a1923..79013364 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/SampleKMeansInitialization.java
@@ -93,7 +93,7 @@ public class SampleKMeansInitialization<V extends NumberVector<?>, D extends Dis
Clustering<? extends MeanModel<V>> clusters = innerkMeans.run(proxydb, proxyv);
List<V> means = new ArrayList<>();
for (Cluster<? extends MeanModel<V>> cluster : clusters.getAllClusters()) {
- means.add((V) cluster.getModel().getMean());
+ means.add(cluster.getModel().getMean());
}
return means;
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
index ed9a528d..1be19bd1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/quality/package-info.java
@@ -1,4 +1,27 @@
/**
* Quality measures for k-Means results.
*/
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.quality; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java
new file mode 100644
index 00000000..55114f7d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/KNNKernelDensityMinimaClustering.java
@@ -0,0 +1,384 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction;
+import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.QuickSelect;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Cluster one-dimensional data by splitting the data set on local minima after
+ * performing kernel density estimation.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Vector type
+ */
+public class KNNKernelDensityMinimaClustering<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<ClusterModel>> implements ClusteringAlgorithm<Clustering<ClusterModel>> {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(KNNKernelDensityMinimaClustering.class);
+
+ /**
+ * Estimation mode.
+ *
+ * @apiviz.exclude
+ */
+ public static enum Mode {
+ BALLOON, // Balloon estimator
+ SAMPLE, // Sample-point estimator
+ }
+
+ /**
+ * Dimension to use for clustering.
+ */
+ protected int dim;
+
+ /**
+ * Kernel density function.
+ */
+ protected KernelDensityFunction kernel;
+
+ /**
+ * Estimation modes.
+ */
+ protected Mode mode;
+
+ /**
+ * Number of neighbors to use for bandwidth.
+ */
+ protected int k;
+
+ /**
+ * Window width, for the local minima criterion.
+ */
+ protected int minwindow;
+
+ /**
+ * Constructor.
+ *
+ * @param dim Dimension to use for clustering
+ * @param kernel Kernel function
+ * @param mode Bandwidth mode
+ * @param k Number of neighbors
+ * @param minwindow Window size for comparison
+ */
+ public KNNKernelDensityMinimaClustering(int dim, KernelDensityFunction kernel, Mode mode, int k, int minwindow) {
+ super();
+ this.dim = dim;
+ this.kernel = kernel;
+ this.mode = mode;
+ this.k = k;
+ this.minwindow = minwindow;
+ }
+
+ /**
+ * Run the clustering algorithm on a data relation.
+ *
+ * @param relation Relation
+ * @return Clustering result
+ */
+ public Clustering<ClusterModel> run(Relation<V> relation) {
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ final int size = ids.size();
+
+ // Sort by the sole dimension
+ ids.sort(new VectorUtil.SortDBIDsBySingleDimension(relation, dim));
+
+ // Density storage.
+ WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
+
+ DBIDArrayIter iter = ids.iter(), iter2 = ids.iter();
+
+ StepProgress sprog = LOG.isVerbose() ? new StepProgress("Clustering steps", 2) : null;
+
+ if(sprog != null) {
+ sprog.beginStep(1, "Kernel density estimation.", LOG);
+ }
+ {
+ double[] scratch = new double[2 * k];
+ iter.seek(0);
+ for(int i = 0; i < size; i++, iter.advance()) {
+ // Current value.
+ final double curv = relation.get(iter).doubleValue(dim);
+
+ final int pre = Math.max(i - k, 0), prek = i - pre;
+ final int pos = Math.min(i + k, size - 1), posk = pos - i;
+ iter2.seek(pre);
+ for(int j = 0; j < prek; j++, iter2.advance()) {
+ scratch[j] = curv - relation.get(iter2).doubleValue(dim);
+ }
+ assert (iter2.getOffset() == i);
+ iter2.advance();
+ for(int j = 0; j < posk; j++, iter2.advance()) {
+ scratch[prek + j] = relation.get(iter2).doubleValue(dim) - curv;
+ }
+
+ assert (prek + posk >= k);
+ double kdist = QuickSelect.quickSelect(scratch, 0, prek + posk, k);
+ switch(mode){
+ case BALLOON: {
+ double dens = 0.;
+ if(kdist > 0.) {
+ for(int j = 0; j < prek + posk; j++) {
+ dens += kernel.density(scratch[j] / kdist);
+ }
+ }
+ else {
+ dens = Double.POSITIVE_INFINITY;
+ }
+ assert (iter.getOffset() == i);
+ density.putDouble(iter, dens);
+ break;
+ }
+ case SAMPLE: {
+ if(kdist > 0.) {
+ iter2.seek(pre);
+ for(int j = 0; j < prek; j++, iter2.advance()) {
+ double delta = curv - relation.get(iter2).doubleValue(dim);
+ density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
+ }
+ assert (iter2.getOffset() == i);
+ iter2.advance();
+ for(int j = 0; j < posk; j++, iter2.advance()) {
+ double delta = relation.get(iter2).doubleValue(dim) - curv;
+ density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
+ }
+ }
+ else {
+ iter2.seek(pre);
+ for(int j = 0; j < prek; j++, iter2.advance()) {
+ double delta = curv - relation.get(iter2).doubleValue(dim);
+ if(!(delta > 0.)) {
+ density.putDouble(iter2, Double.POSITIVE_INFINITY);
+ }
+ }
+ assert (iter2.getOffset() == i);
+ iter2.advance();
+ for(int j = 0; j < posk; j++, iter2.advance()) {
+ double delta = relation.get(iter2).doubleValue(dim) - curv;
+ if(!(delta > 0.)) {
+ density.putDouble(iter2, Double.POSITIVE_INFINITY);
+ }
+ }
+ }
+ break;
+ }
+ default:
+ throw new UnsupportedOperationException("Unknown mode specified.");
+ }
+ }
+ }
+
+ if(sprog != null) {
+ sprog.beginStep(2, "Local minima detection.", LOG);
+ }
+ Clustering<ClusterModel> clustering = new Clustering<>("One-Dimensional Clustering using Kernel Density Estimation", "onedimensional-kde-clustering");
+ {
+ double[] scratch = new double[2 * minwindow + 1];
+ int begin = 0;
+ int halfw = (minwindow + 1) >> 1;
+ iter.seek(0);
+ // Fill initial buffer.
+ for(int i = 0; i < size; i++, iter.advance()) {
+ final int m = i % scratch.length, t = (i - minwindow - 1) % scratch.length;
+ scratch[m] = density.doubleValue(iter);
+ if(i > scratch.length) {
+ double min = Double.POSITIVE_INFINITY;
+ for(int j = 0; j < scratch.length; j++) {
+ if(j != t && scratch[j] < min) {
+ min = scratch[j];
+ }
+ }
+ // Local minimum:
+ if(scratch[t] < min) {
+ int end = i - minwindow + 1;
+ { // Test on which side the kNN is
+ iter2.seek(end);
+ double curv = relation.get(iter2).doubleValue(dim);
+ iter2.seek(end - halfw);
+ double left = relation.get(iter2).doubleValue(dim) - curv;
+ iter2.seek(end + halfw);
+ double right = curv - relation.get(iter2).doubleValue(dim);
+ if(left < right) {
+ end++;
+ }
+ }
+ iter2.seek(begin);
+ ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
+ for(int j = 0; j < end - begin; j++, iter2.advance()) {
+ cids.add(iter2);
+ }
+ clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
+ begin = end;
+ }
+ }
+ }
+ // Extract last cluster
+ int end = size;
+ iter2.seek(begin);
+ ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
+ for(int j = 0; j < end - begin; j++, iter2.advance()) {
+ cids.add(iter2);
+ }
+ clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
+ }
+
+ if(sprog != null) {
+ sprog.setCompleted(LOG);
+ }
+ return clustering;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(new VectorFieldTypeInformation<>(NumberVector.class, dim + 1, Integer.MAX_VALUE));
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Dimension to use for clustering.
+ */
+ public static final OptionID DIM_ID = new OptionID("kernelcluster.dim", "Dimension to use for clustering. For one-dimensional data, use 0.");
+
+ /**
+ * Kernel function.
+ */
+ public static final OptionID KERNEL_ID = new OptionID("kernelcluster.kernel", "Kernel function for density estimation.");
+
+ /**
+ * KDE mode.
+ */
+ public static final OptionID MODE_ID = new OptionID("kernelcluster.mode", "Kernel density estimation mode (balloon estimator vs. sample point estimator).");
+
+ /**
+ * Number of neighbors for bandwidth estimation.
+ */
+ public static final OptionID K_ID = new OptionID("kernelcluster.knn", "Number of nearest neighbors to use for bandwidth estimation.");
+
+ /**
+ * Half window width to find local minima.
+ */
+ public static final OptionID WINDOW_ID = new OptionID("kernelcluster.window", "Half width of sliding window to find local minima.");
+
+ /**
+ * Dimension to use for clustering.
+ */
+ protected int dim;
+
+ /**
+ * Kernel density function.
+ */
+ protected KernelDensityFunction kernel;
+
+ /**
+ * Estimation modes.
+ */
+ protected Mode mode;
+
+ /**
+ * Number of neighbors to use for bandwidth.
+ */
+ protected int k;
+
+ /**
+ * Window width, for the local minima criterion.
+ */
+ protected int minwindow;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter dimP = new IntParameter(DIM_ID, 0);
+ dimP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(dimP)) {
+ dim = dimP.intValue();
+ }
+
+ ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, EpanechnikovKernelDensityFunction.class);
+ if(config.grab(kernelP)) {
+ kernel = kernelP.instantiateClass(config);
+ }
+
+ EnumParameter<Mode> modeP = new EnumParameter<>(MODE_ID, Mode.class, Mode.BALLOON);
+ if(config.grab(modeP)) {
+ mode = modeP.getValue();
+ }
+
+ IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.intValue();
+ }
+
+ IntParameter windowP = new IntParameter(WINDOW_ID);
+ windowP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(windowP)) {
+ minwindow = windowP.intValue();
+ }
+ }
+
+ @Override
+ protected KNNKernelDensityMinimaClustering<V> makeInstance() {
+ return new KNNKernelDensityMinimaClustering<>(dim, kernel, mode, k, minwindow);
+ }
+ }
+}
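
Aside: the two estimation modes differ only in whose kNN distance sets the kernel bandwidth. BALLOON scales by the bandwidth of the point whose density is being estimated; SAMPLE scales by the bandwidth of each contributing neighbor. A one-dimensional sketch using a triangular kernel (illustrative only; the class above defaults to the Epanechnikov kernel):

  // Balloon estimator: one bandwidth hq, taken at the query point q.
  static double balloon(double q, double[] x, double hq) {
    double dens = 0.;
    for (double xi : x) {
      final double u = Math.abs(q - xi) / hq;
      dens += Math.max(0., 1. - u); // triangular kernel
    }
    return dens;
  }

  // Sample-point estimator: per-sample bandwidths h[i] (e.g. kNN distances).
  static double samplePoint(double q, double[] x, double[] h) {
    double dens = 0.;
    for (int i = 0; i < x.length; i++) {
      final double u = Math.abs(q - x[i]) / h[i];
      dens += Math.max(0., 1. - u);
    }
    return dens;
  }
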
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java
new file mode 100644
index 00000000..c6c55244
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/onedimensional/package-info.java
@@ -0,0 +1,27 @@
+/**
+ * Clustering algorithms for one-dimensional data.
+ */
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.onedimensional; \ No newline at end of file
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
index db026e93..617d74cd 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/CLIQUE.java
@@ -56,8 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
@@ -594,14 +593,14 @@ public class CLIQUE<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter xsiP = new IntParameter(XSI_ID);
- xsiP.addConstraint(new GreaterConstraint(0));
+ xsiP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(xsiP)) {
xsi = xsiP.intValue();
}
DoubleParameter tauP = new DoubleParameter(TAU_ID);
- tauP.addConstraint(new GreaterConstraint(0));
- tauP.addConstraint(new LessConstraint(1));
+ tauP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ tauP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
if(config.grab(tauP)) {
tau = tauP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
new file mode 100644
index 00000000..5f798a66
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DOC.java
@@ -0,0 +1,605 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceMaximumDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * <p>
+ * Provides the DOC algorithm and its heuristic variant, FastDOC. DOC is a
+ * sampling-based subspace clustering algorithm.
+ * </p>
+ *
+ * <p>
+ * Reference: <br/>
+ * C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali<br />
+ * A Monte Carlo algorithm for fast projective clustering. <br/>
+ * In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02).
+ * </p>
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.has SubspaceModel
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("DOC: Density-based Optimal projective Clustering")
+@Reference(authors = "C. M. Procopiuc, M. Jones, P. K. Agarwal, T. M. Murali", title = "A Monte Carlo algorithm for fast projective clustering", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '02)", url = "http://dx.doi.org/10.1145/564691.564739")
+public class DOC<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(DOC.class);
+
+ /**
+ * Relative density threshold parameter alpha.
+ */
+ private double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions.
+ */
+ private double beta;
+
+ /**
+ * Half width parameter.
+ */
+ private double w;
+
+ /**
+ * Holds the value of {@link Parameterizer#HEURISTICS_ID}.
+ */
+ private boolean heuristics;
+
+ /**
+ * Holds the value of {@link Parameterizer#D_ZERO_ID}.
+ */
+ private int d_zero;
+
+ /**
+ * Randomizer used internally for sampling points.
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha &alpha; relative density threshold.
+ * @param beta &beta; balancing parameter for size vs. dimensionality.
+ * @param w <em>w</em> half width parameter.
+ * @param heuristics whether to use heuristics (FastDOC) or not.
+ * @param random Random factory
+ */
+ public DOC(double alpha, double beta, double w, boolean heuristics, int d_zero, RandomFactory random) {
+ this.alpha = alpha;
+ this.beta = beta;
+ this.w = w;
+ this.heuristics = heuristics;
+ this.d_zero = d_zero;
+ this.rnd = random;
+ }
+
+ /**
+ * Performs the DOC or FastDOC (as configured) algorithm on the given
+ * Database.
+ *
+ * <p>
+ * This will run exhaustively, i.e. it repeats DOC until no further cluster
+ * is found or the remaining set of points has shrunk below the minimum
+ * cluster size threshold.
+ * </p>
+ *
+ * @param database Database
+ * @param relation Data relation
+ * @return Clustering result
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ // Dimensionality of our set.
+ final int d = RelationUtil.dimensionality(relation);
+
+ // Get available DBIDs as a set we can remove items from.
+ ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());
+
+ // Precompute values as described in Figure 2.
+ double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
+ // Outer loop count.
+ int n = (int) (2. / alpha);
+ // Inner loop count.
+ int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
+ if(heuristics) {
+ m = Math.min(m, Math.min(1000000, d * d));
+ }
+
+ // Minimum size for a cluster for it to be accepted.
+ int minClusterSize = (int) (alpha * S.size());
+
+ // List of all clusters we found.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("DOC Clusters", "DOC");
+
+ // Inform the user about the number of actual clusters found so far.
+ IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
+
+ // To find more than a single cluster, we continue running until the
+ // remaining set of points is too small to form another cluster.
+ while(S.size() > minClusterSize) {
+ Cluster<SubspaceModel<V>> C;
+ if(heuristics) {
+ C = runFastDOC(relation, S, d, n, m, (int) r);
+ }
+ else {
+ C = runDOC(relation, S, d, n, m, (int) r, minClusterSize);
+ }
+
+ if(C == null) {
+ // Stop trying if we couldn't find a cluster.
+ break;
+ }
+ // Found a cluster, remember it, remove its points from the set.
+ result.addToplevelCluster(C);
+
+ // Remove all points of the cluster from the set and continue.
+ S.removeDBIDs(C.getIDs());
+
+ if(cprogress != null) {
+ cprogress.setProcessed(result.getAllClusters().size(), LOG);
+ }
+ }
+
+ // Add the remainder as noise.
+ if(S.size() > 0) {
+ BitSet alldims = new BitSet();
+ alldims.set(0, d);
+ result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel<>(new Subspace(alldims), Centroid.make(relation, S).toVector(relation))));
+ }
+
+ if(cprogress != null) {
+ cprogress.setCompleted(LOG);
+ }
+
+ return result;
+ }
+
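
Aside: the loop counts precomputed at the top of run() follow Figure 2 of the reference: sample size r = |log(2d) / log(beta/2)|, n = 2/alpha outer (seed) iterations, and m = (2/alpha)^r * ln 4 inner iterations, with FastDOC capping m. A standalone sketch of the computation, with one worked example (numbers are illustrative):

  // DOC loop counts per Figure 2 of Procopiuc et al.
  static int[] docCounts(int d, double alpha, double beta, boolean heuristics) {
    double r = Math.abs(Math.log(d + d) / Math.log(beta * .5));
    int n = (int) (2. / alpha);
    int m = (int) (Math.pow(2. / alpha, r) * Math.log(4));
    if (heuristics) {
      m = Math.min(m, Math.min(1000000, d * d)); // FastDOC cap
    }
    return new int[] { (int) r, n, m };
  }
  // e.g. d = 8, alpha = 0.1, beta = 0.25: r ~ 1.33, n = 20, m ~ 75
  // (capped to d*d = 64 when the FastDOC heuristics are enabled).
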
+ /**
+ * Performs a single run of DOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @param minClusterSize Minimum size a cluster must have to be accepted.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runDOC(Relation<V> relation, ArrayModifiableDBIDs S, final int d, int n, int m, int r, int minClusterSize) {
+ final DoubleDistance wd = new DoubleDistance(w);
+ // Best cluster for the current run.
+ DBIDs C = null;
+ // Relevant attributes for the best cluster.
+ BitSet D = null;
+ // Quality of the best cluster.
+ double quality = Double.NEGATIVE_INFINITY;
+
+ // Bounds for our cluster.
+ // ModifiableHyperBoundingBox bounds = new ModifiableHyperBoundingBox(new
+ // double[d], new double[d]);
+
+ // Weights for distance (= rectangle query)
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(new BitSet(d));
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq);
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+ DBIDArrayIter iter = S.iter();
+
+ for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension and build bounding box.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+ if(nD.cardinality() > 0) {
+ // Get all points in the box.
+ df.setSelectedDimensions(nD);
+ // TODO: add filtering capabilities into query API!
+ DBIDs nC = DBIDUtil.intersection(S, rq.getRangeForDBID(iter, wd));
+
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("Testing a cluster candidate, |C| = " + nC.size() + ", |D| = " + nD.cardinality());
+ }
+
+ // Is the cluster large enough?
+ if(nC.size() < minClusterSize) {
+ // Too small.
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but it's too small.");
+ }
+ }
+ else {
+ // Better cluster than before?
+ double nQuality = computeClusterQuality(nC.size(), nD.cardinality());
+ if(nQuality > quality) {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... and it's the best so far: " + nQuality + " vs. " + quality);
+ }
+ C = nC;
+ D = nD;
+ quality = nQuality;
+ }
+ else {
+ if(LOG.isDebuggingFiner()) {
+ LOG.finer("... but we already have a better one.");
+ }
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ if(C != null) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Performs a single run of FastDOC, finding a single cluster.
+ *
+ * @param relation used to get actual values for DBIDs.
+ * @param S The set of points we're working on.
+ * @param d Dimensionality of the data set we're currently working on.
+ * @param n Number of outer iterations (seed points).
+ * @param m Number of inner iterations (per seed point).
+ * @param r Size of random samples.
+ * @return a cluster, if one is found, else <code>null</code>.
+ */
+ private Cluster<SubspaceModel<V>> runFastDOC(Relation<V> relation, ArrayModifiableDBIDs S, int d, int n, int m, int r) {
+ // Relevant attributes of highest cardinality.
+ BitSet D = null;
+ // The seed point for the best dimensions.
+ DBIDVar dV = DBIDUtil.newVar();
+
+ // Inform the user about the progress in the current iteration.
+ FiniteProgress iprogress = LOG.isVerbose() ? new FiniteProgress("Iteration progress for current cluster", m * n, LOG) : null;
+
+ Random random = rnd.getSingleThreadedRandom();
+
+ DBIDArrayIter iter = S.iter();
+ outer: for(int i = 0; i < n; ++i) {
+ // Pick a random seed point.
+ iter.seek(random.nextInt(S.size()));
+
+ for(int j = 0; j < m; ++j) {
+ // Choose a set of random points.
+ DBIDs randomSet = DBIDUtil.randomSample(S, Math.min(S.size(), r), random);
+
+ // Initialize cluster info.
+ BitSet nD = new BitSet(d);
+
+ // Test each dimension.
+ for(int k = 0; k < d; ++k) {
+ if(dimensionIsRelevant(k, relation, randomSet)) {
+ nD.set(k);
+ }
+ }
+
+ if(D == null || nD.cardinality() > D.cardinality()) {
+ D = nD;
+ dV.set(iter);
+
+ if(D.cardinality() >= d_zero) {
+ if(iprogress != null) {
+ iprogress.setProcessed(iprogress.getTotal(), LOG);
+ }
+ break outer;
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.incrementProcessed(LOG);
+ }
+ }
+ }
+
+ if(iprogress != null) {
+ iprogress.ensureCompleted(LOG);
+ }
+
+ // If no relevant dimensions were found, we cannot produce a cluster.
+ if(D == null || D.cardinality() == 0) {
+ return null;
+ }
+
+ // Get all points in the box.
+ SubspaceMaximumDistanceFunction df = new SubspaceMaximumDistanceFunction(D);
+ DistanceQuery<V, DoubleDistance> dq = relation.getDatabase().getDistanceQuery(relation, df);
+ RangeQuery<V, DoubleDistance> rq = relation.getDatabase().getRangeQuery(dq, DatabaseQuery.HINT_SINGLE);
+
+ // TODO: add filtering capabilities into query API!
+ DBIDs C = DBIDUtil.intersection(S, rq.getRangeForDBID(dV, new DoubleDistance(w)));
+
+ // If we have a non-empty cluster, return it.
+ if(C.size() > 0) {
+ return makeCluster(relation, C, D);
+ }
+ else {
+ return null;
+ }
+ }
+
+ /**
+ * Utility method to test if a given dimension is relevant as determined via a
+ * set of reference points (i.e. if the spread, max minus min, of the points
+ * along the attribute is at most the width threshold w).
+ *
+ * @param dimension the dimension to test.
+ * @param relation used to get actual values for DBIDs.
+ * @param points the points to test.
+ * @return <code>true</code> if the dimension is relevant.
+ */
+ private boolean dimensionIsRelevant(int dimension, Relation<V> relation, DBIDs points) {
+ double min = Double.POSITIVE_INFINITY;
+ double max = Double.NEGATIVE_INFINITY;
+ for(DBIDIter iter = points.iter(); iter.valid(); iter.advance()) {
+ V xV = relation.get(iter);
+ min = Math.min(min, xV.doubleValue(dimension));
+ max = Math.max(max, xV.doubleValue(dimension));
+ if(max - min > w) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Utility method to create a subspace cluster from a list of DBIDs and the
+ * relevant attributes.
+ *
+ * @param relation to compute a centroid.
+ * @param C the cluster points.
+ * @param D the relevant dimensions.
+ * @return an object representing the subspace cluster.
+ */
+ private Cluster<SubspaceModel<V>> makeCluster(Relation<V> relation, DBIDs C, BitSet D) {
+ DBIDs ids = DBIDUtil.newHashSet(C); // copy, also to lose distance values!
+ Cluster<SubspaceModel<V>> cluster = new Cluster<>(ids);
+ cluster.setModel(new SubspaceModel<>(new Subspace(D), Centroid.make(relation, ids).toVector(relation)));
+ return cluster;
+ }
+
+ /**
+ * Computes the quality of a cluster based on its size and number of relevant
+ * attributes, as described via the &mu;-function from the paper.
+ *
+ * @param clusterSize the size of the cluster.
+ * @param numRelevantDimensions the number of dimensions relevant to the
+ * cluster.
+ * @return a quality measure (only use this to compare the quality to that of
+ *         other clusters).
+ */
+ private double computeClusterQuality(int clusterSize, int numRelevantDimensions) {
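+ // mu(|C|, |D|) = |C| * (1/beta)^|D|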
+ return clusterSize * Math.pow(1. / beta, numRelevantDimensions);
+ }
+
+ // ---------------------------------------------------------------------- //
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ public static final OptionID ALPHA_ID = new OptionID("doc.alpha", "Minimum relative density for a set of points to be considered a cluster (|C|>=doc.alpha*|S|).");
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions.
+ */
+ public static final OptionID BETA_ID = new OptionID("doc.beta", "Preference of cluster size versus number of relevant dimensions (higher value means higher priority on larger clusters).");
+
+ /**
+ * Half width parameter.
+ */
+ public static final OptionID W_ID = new OptionID("doc.w", "Maximum extent of scattering of points along a single attribute for the attribute to be considered relevant.");
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ public static final OptionID HEURISTICS_ID = new OptionID("doc.fastdoc", "Use heuristics as described, thus using the FastDOC algorithm.");
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ public static final OptionID D_ZERO_ID = new OptionID("doc.d0", "Parameter for FastDOC, setting the number of relevant attributes which, when found for a cluster, are deemed enough to stop iterating.");
+
+ /**
+ * Random seeding parameter.
+ */
+ public static final OptionID RANDOM_ID = new OptionID("doc.random-seed", "Random seed, for reproducible experiments.");
+
+ /**
+ * Relative density threshold parameter Alpha.
+ */
+ protected double alpha;
+
+ /**
+ * Balancing parameter for importance of points vs. dimensions.
+ */
+ protected double beta;
+
+ /**
+ * Half width parameter.
+ */
+ protected double w;
+
+ /**
+ * Parameter to enable FastDOC heuristics.
+ */
+ protected boolean heuristics;
+
+ /**
+ * Stopping threshold for FastDOC.
+ */
+ protected int d_zero;
+
+ /**
+ * Random seeding factory.
+ */
+ protected RandomFactory random = RandomFactory.DEFAULT;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_ID, 0.2);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(BETA_ID, 0.8);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(param)) {
+ beta = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(W_ID, 0.05);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ w = param.getValue();
+ }
+ }
+
+ {
+ Flag param = new Flag(HEURISTICS_ID);
+ if(config.grab(param)) {
+ heuristics = param.getValue();
+ }
+ }
+
+ if(heuristics) {
+ IntParameter param = new IntParameter(D_ZERO_ID, 5);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ d_zero = param.getValue();
+ }
+ }
+
+ {
+ RandomParameter param = new RandomParameter(RANDOM_ID);
+ if(config.grab(param)) {
+ random = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected DOC<V> makeInstance() {
+ return new DOC<>(alpha, beta, w, heuristics, d_zero, random);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
index b17ebebb..cd5e51b8 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/DiSH.java
@@ -69,8 +69,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -170,12 +169,12 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
*/
public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
// Instantiate DiSH distance (and thus run the preprocessor)
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("*** Run DiSH preprocessor.");
}
DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation);
// Configure and run OPTICS.
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("*** Run OPTICS algorithm.");
}
ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters);
@@ -186,7 +185,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
optics = opticsconfig.tryInstantiate(cls);
ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("*** Compute Clusters.");
}
return computeClusters(relation, opticsResult, dishDistanceQuery);
@@ -206,10 +205,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// extract clusters
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 1: extract clusters");
- for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
+ for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
}
}
@@ -218,10 +217,10 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// check if there are clusters < minpts
checkClusters(database, distFunc, clustersMap, minpts);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 2: check clusters");
- for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
+ for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
}
}
@@ -230,9 +229,9 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// sort the clusters
List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 3: sort clusters");
- for (Cluster<SubspaceModel<V>> c : clusters) {
+ for(Cluster<SubspaceModel<V>> c : clusters) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size());
}
LOG.verbose(msg.toString());
@@ -241,14 +240,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// build the hierarchy
Clustering<SubspaceModel<V>> clustering = new Clustering<>("DiSH clustering", "dish-clustering");
buildHierarchy(database, distFunc, clustering, clusters, dimensionality);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
StringBuilder msg = new StringBuilder("Step 4: build hierarchy");
- for (Cluster<SubspaceModel<V>> c : clusters) {
+ for(Cluster<SubspaceModel<V>> c : clusters) {
msg.append('\n').append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size());
- for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterParents(c); iter.valid(); iter.advance()) {
msg.append("\n parent ").append(iter.get());
}
- for (Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
+ for(Iter<Cluster<SubspaceModel<V>>> iter = clustering.getClusterHierarchy().iterChildren(c); iter.valid(); iter.advance()) {
msg.append("\n child ").append(iter.get());
}
}
@@ -256,8 +255,8 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
// build result
- for (Cluster<SubspaceModel<V>> c : clusters) {
- if (clustering.getClusterHierarchy().numParents(c) == 0) {
+ for(Cluster<SubspaceModel<V>> c : clusters) {
+ if(clustering.getClusterHierarchy().numParents(c) == 0) {
clustering.addToplevelCluster(c);
}
}
@@ -278,7 +277,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<>();
Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<>();
Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<>();
- for (Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
+ for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next();
entryMap.put(entry.getID(), entry);
@@ -287,43 +286,43 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
// get the list of (parallel) clusters for the preference vector
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector);
- if (parallelClusters == null) {
+ if(parallelClusters == null) {
parallelClusters = new ArrayList<>();
clustersMap.put(preferenceVector, parallelClusters);
}
// look for the proper cluster
Pair<BitSet, ArrayModifiableDBIDs> cluster = null;
- for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
V c_centroid = ProjectedCentroid.make(c.first, database, c.second).toVector(database);
PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector);
- if (dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
+ if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector());
- if (d <= 2 * epsilon) {
+ if(d <= 2 * epsilon) {
cluster = c;
break;
}
}
}
- if (cluster == null) {
+ if(cluster == null) {
cluster = new Pair<>(preferenceVector, DBIDUtil.newArray());
parallelClusters.add(cluster);
}
cluster.second.add(entry.getID());
entryToClusterMap.put(entry.getID(), cluster);
- if (progress != null) {
+ if(progress != null) {
progress.setProcessed(++processed, LOG);
}
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
- if (LOG.isDebuggingFiner()) {
+ if(LOG.isDebuggingFiner()) {
StringBuilder msg = new StringBuilder("Step 0");
- for (List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
- for (Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
+ for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
msg.append('\n').append(FormatUtil.format(RelationUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size());
}
}
@@ -331,24 +330,24 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
// add the predecessor to the cluster
- for (BitSet pv : clustersMap.keySet()) {
+ for(BitSet pv : clustersMap.keySet()) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for (Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
- if (cluster.second.isEmpty()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
+ if(cluster.second.isEmpty()) {
continue;
}
DBID firstID = cluster.second.get(0);
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID);
DBID predecessorID = entry.getPredecessorID();
- if (predecessorID == null) {
+ if(predecessorID == null) {
continue;
}
ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID);
// parallel cluster
- if (predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
+ if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
continue;
}
- if (predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
+ if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
continue;
}
@@ -375,16 +374,17 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
final int db_dim = RelationUtil.dimensionality(database);
// int num = 1;
List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<>();
- for (BitSet pv : clustersMap.keySet()) {
+ for(BitSet pv : clustersMap.keySet()) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for (int i = 0; i < parallelClusters.size(); i++) {
+ for(int i = 0; i < parallelClusters.size(); i++) {
Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i);
Cluster<SubspaceModel<V>> cluster = new Cluster<>(c.second);
cluster.setModel(new SubspaceModel<>(new Subspace(c.first), Centroid.make(database, c.second).toVector(database)));
String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, "");
- if (parallelClusters.size() > 1) {
+ if(parallelClusters.size() > 1) {
cluster.setName("Cluster_" + subspace + "_" + i);
- } else {
+ }
+ else {
cluster.setName("Cluster_" + subspace);
}
clusters.add(cluster);
@@ -417,11 +417,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<>();
Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<>();
Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<>(new BitSet(), DBIDUtil.newArray());
- for (BitSet pv : clustersMap.keySet()) {
+ for(BitSet pv : clustersMap.keySet()) {
// noise
- if (pv.cardinality() == 0) {
+ if(pv.cardinality() == 0) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
- for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
noise.second.addDBIDs(c.second);
}
}
@@ -429,10 +429,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
else {
List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<>(parallelClusters.size());
- for (Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
- if (!pv.equals(new BitSet()) && c.second.size() < minpts) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
+ if(!pv.equals(new BitSet()) && c.second.size() < minpts) {
notAssigned.add(c);
- } else {
+ }
+ else {
newParallelClusters.add(c);
}
}
@@ -443,14 +444,15 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
clustersMap.clear();
clustersMap.putAll(newClustersMap);
- for (Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
- if (c.second.isEmpty()) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
+ if(c.second.isEmpty()) {
continue;
}
Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap);
- if (parent != null) {
+ if(parent != null) {
parent.second.addDBIDs(c.second);
- } else {
+ }
+ else {
noise.second.addDBIDs(c.second);
}
}
@@ -477,23 +479,23 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
BitSet childPV = child.first;
int childCardinality = childPV.cardinality();
- for (BitSet parentPV : clustersMap.keySet()) {
+ for(BitSet parentPV : clustersMap.keySet()) {
int parentCardinality = parentPV.cardinality();
- if (parentCardinality >= childCardinality) {
+ if(parentCardinality >= childCardinality) {
continue;
}
- if (resultCardinality != -1 && parentCardinality <= resultCardinality) {
+ if(resultCardinality != -1 && parentCardinality <= resultCardinality) {
continue;
}
BitSet pv = (BitSet) childPV.clone();
pv.and(parentPV);
- if (pv.equals(parentPV)) {
+ if(pv.equals(parentPV)) {
List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV);
- for (Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
+ for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
V parent_centroid = ProjectedCentroid.make(parentPV, database, parent.second).toVector(database);
double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV);
- if (d <= 2 * epsilon) {
+ if(d <= 2 * epsilon) {
result = parent;
resultCardinality = parentCardinality;
break;
@@ -519,57 +521,59 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
final int db_dim = RelationUtil.dimensionality(database);
Hierarchy<Cluster<SubspaceModel<V>>> hier = clustering.getClusterHierarchy();
- for (int i = 0; i < clusters.size() - 1; i++) {
+ for(int i = 0; i < clusters.size() - 1; i++) {
Cluster<SubspaceModel<V>> c_i = clusters.get(i);
int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality();
V ci_centroid = ProjectedCentroid.make(c_i.getModel().getDimensions(), database, c_i.getIDs()).toVector(database);
- for (int j = i + 1; j < clusters.size(); j++) {
+ for(int j = i + 1; j < clusters.size(); j++) {
Cluster<SubspaceModel<V>> c_j = clusters.get(j);
int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality();
- if (subspaceDim_i < subspaceDim_j) {
- if (LOG.isDebugging()) {
+ if(subspaceDim_i < subspaceDim_j) {
+ if(LOG.isDebugging()) {
msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append(']');
msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append(']');
}
// noise level reached
- if (c_j.getModel().getSubspace().dimensionality() == 0) {
+ if(c_j.getModel().getSubspace().dimensionality() == 0) {
// no parents exists -> parent is noise
- if (hier.numParents(c_i) == 0) {
+ if(hier.numParents(c_i) == 0) {
clustering.addChildCluster(c_j, c_i);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
msg.append(']');
}
}
- } else {
+ }
+ else {
V cj_centroid = ProjectedCentroid.make(c_j.getModel().getDimensions(), database, c_j.getIDs()).toVector(database);
PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions());
double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector());
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
msg.append("\n dist = ").append(distance.getCorrelationValue());
}
- if (distance.getCorrelationValue() == subspaceDim_j) {
- if (LOG.isDebugging()) {
+ if(distance.getCorrelationValue() == subspaceDim_j) {
+ if(LOG.isDebugging()) {
msg.append("\n d = ").append(d);
}
- if (d <= 2 * epsilon) {
+ if(d <= 2 * epsilon) {
// no parent exists or c_j is not a parent of the already
// existing parents
- if (hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
+ if(hier.numParents(c_i) == 0 || !isParent(database, distFunc, c_j, hier.iterParents(c_i))) {
clustering.addChildCluster(c_j, c_i);
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
msg.append("] is parent of [");
msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
msg.append(']');
}
}
- } else {
+ }
+ else {
throw new RuntimeException("Should never happen: d = " + d);
}
}
@@ -577,7 +581,7 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
}
}
}
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug(msg.toString());
}
}
@@ -599,11 +603,11 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
int dimensionality = RelationUtil.dimensionality(database);
int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality();
- for (; iter.valid(); iter.advance()) {
+ for(; iter.valid(); iter.advance()) {
Cluster<SubspaceModel<V>> child = iter.get();
V child_centroid = ProjectedCentroid.make(child.getModel().getDimensions(), database, child.getIDs()).toVector(database);
PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions());
- if (distance.getCorrelationValue() == subspaceDim_parent) {
+ if(distance.getCorrelationValue() == subspaceDim_parent) {
return true;
}
}
@@ -642,14 +646,14 @@ public class DiSH<V extends NumberVector<?>> extends AbstractAlgorithm<Clusterin
super.makeOptions(config);
DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, 0.001);
- epsilonP.addConstraint(new GreaterEqualConstraint(0));
- if (config.grab(epsilonP)) {
+ epsilonP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+ if(config.grab(epsilonP)) {
epsilon = epsilonP.doubleValue();
}
IntParameter muP = new IntParameter(MU_ID, 1);
- muP.addConstraint(new GreaterConstraint(0));
- if (config.grab(muP)) {
+ muP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(muP)) {
mu = muP.intValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
index 9ac7c072..3f135564 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/HiSC.java
@@ -34,8 +34,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
@@ -95,8 +94,8 @@ public class HiSC<V extends NumberVector<?>> extends OPTICS<V, PreferenceVectorB
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter alphaP = new DoubleParameter(HiSCPreferenceVectorIndex.Factory.ALPHA_ID, HiSCPreferenceVectorIndex.Factory.DEFAULT_ALPHA);
- alphaP.addConstraint(new GreaterConstraint(0.0));
- alphaP.addConstraint(new LessConstraint(1.0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ alphaP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
double alpha = 0.0;
if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
new file mode 100644
index 00000000..9d1ee94d
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/P3C.java
@@ -0,0 +1,1000 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.EM;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.Subspace;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.MutableProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.VMath;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.PoissonDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * P3C: A Robust Projected Clustering Algorithm.
+ *
+ * <p>
+ * Reference: <br/>
+ * Gabriela Moise, Jörg Sander, Martin Ester<br />
+ * P3C: A Robust Projected Clustering Algorithm.<br/>
+ * In: Proc. Sixth International Conference on Data Mining (ICDM '06)
+ * </p>
+ *
+ * This is not a complete implementation of P3C, but it should be good enough
+ * for most users. Improvements are welcome. The most obvious missing step is
+ * section 3.5 of the paper, where the cluster subspaces are refined.
+ *
+ * @author Florian Nuecke
+ * @author Erich Schubert
+ *
+ * @apiviz.uses EM
+ * @apiviz.has SubspaceModel
+ * @apiviz.has ClusterCandidate
+ * @apiviz.has Signature
+ *
+ * @param <V> the type of NumberVector handled by this Algorithm.
+ */
+@Title("P3C: A Robust Projected Clustering Algorithm.")
+@Reference(authors = "Gabriela Moise, Jörg Sander, Martin Ester", title = "P3C: A Robust Projected Clustering Algorithm", booktitle = "Proc. Sixth International Conference on Data Mining (ICDM '06)", url = "http://dx.doi.org/10.1109/ICDM.2006.123")
+public class P3C<V extends NumberVector<?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements SubspaceClusteringAlgorithm<SubspaceModel<V>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(P3C.class);
+
+ /**
+ * Parameter for the Poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+ * Minimum cluster size for noise flagging. (Not part of the original
+ * publication.)
+ */
+ protected int minClusterSize;
+
+ /**
+ * Alpha threshold for testing.
+ */
+ protected double alpha = 0.001;
+
+ /**
+ * Constructor.
+ *
+ * @param alpha ChiSquared test threshold
+ * @param poissonThreshold Poisson test threshold
+ * @param maxEmIterations Maximum number of EM iterations
+ * @param emDelta EM stopping threshold
+ * @param minClusterSize Minimum cluster size
+ */
+ public P3C(double alpha, double poissonThreshold, int maxEmIterations, double emDelta, int minClusterSize) {
+ super();
+ this.alpha = alpha;
+ this.poissonThreshold = poissonThreshold;
+ this.maxEmIterations = maxEmIterations;
+ this.emDelta = emDelta;
+ this.minClusterSize = minClusterSize;
+ }
+
+ /**
+ * Performs the P3C algorithm on the given Database.
+ */
+ public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
+ final int dim = RelationUtil.dimensionality(relation);
+
+ // Overall progress.
+ StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
+ }
+
+ // Desired number of bins, as per Sturges' rule:
+ final int binCount = (int) Math.ceil(1 + (Math.log(relation.size()) / MathUtil.LOG2));
+
+ // Perform 1-dimensional projections, and split into bins.
+ SetDBIDs[][] partitions = partitionData(relation, binCount);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
+ }
+
+ // Set markers for each attribute until they're all deemed uniform.
+ final long[][] markers = new long[dim][];
+ int numuniform = 0;
+ for(int d = 0; d < dim; d++) {
+ final SetDBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d] = BitsUtil.zero(binCount);
+ int card = 0;
+ while(card < binCount - 1) {
+ // Find bin with largest support, test only the dimensions that were not
+ // previously marked.
+ int bestBin = chiSquaredUniformTest(parts, marked, card);
+ if(bestBin < 0) {
+ numuniform++;
+ break; // Uniform
+ }
+ BitsUtil.setI(marked, bestBin);
+ card++;
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
+ }
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
+ }
+
+ ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
+ }
+
+ ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
+ }
+
+ clusterCores = pruneRedundantClusterCores(clusterCores);
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of cluster cores found: " + clusterCores.size());
+ }
+
+ if(clusterCores.size() == 0) {
+ if(stepProgress != null) {
+ stepProgress.setCompleted(LOG);
+ }
+ Clustering<SubspaceModel<V>> c = new Clustering<>("P3C", "P3C");
+ c.addToplevelCluster(new Cluster<SubspaceModel<V>>(relation.getDBIDs(), true));
+ return c;
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
+ }
+
+ // Track objects not assigned to any cluster:
+ ModifiableDBIDs noise = DBIDUtil.newHashSet();
+ WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
+ int k = clusterCores.size();
+ double[] clusterWeights = new double[k];
+ computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, clusterWeights);
+
+ // Initial estimate of covariances, to assign noise objects
+ Vector[] means = new Vector[k];
+ Matrix[] covarianceMatrices = new Matrix[k], invCovMatr = new Matrix[k];
+ final double norm = MathUtil.powi(MathUtil.TWOPI, dim);
+ double[] normDistrFactor = new double[k];
+ Arrays.fill(normDistrFactor, 1. / Math.sqrt(norm));
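+ // 1 / sqrt((2 * pi)^dim): normalization factor of the multivariate Gaussian density.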
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ assignUnassigned(relation, probClusterIGivenX, means, invCovMatr, clusterWeights, noise);
+
+ double emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
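+ // Iterate EM until the expectation value improves by at most emDelta;
+ // a negative maxEmIterations means no iteration limit.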
+ for(int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
+ final double emOld = emNew;
+ EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, means, covarianceMatrices, dim);
+ EM.computeInverseMatrixes(covarianceMatrices, invCovMatr, normDistrFactor, norm);
+ // reassign probabilities
+ emNew = EM.assignProbabilitiesToInstances(relation, normDistrFactor, means, invCovMatr, clusterWeights, probClusterIGivenX);
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("iteration " + it + " - expectation value: " + emNew);
+ }
+ if((emNew - emOld) <= emDelta) {
+ break;
+ }
+ }
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(7, "Generating hard clustering.", LOG);
+ }
+
+ // Create a hard clustering, making sure each data point only is part of one
+ // cluster, based on the best match from the membership matrix.
+ ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
+ }
+
+ // Outlier detection. Remove points from clusters that have a Mahalanobis
+ // distance larger than the critical value of the ChiSquare distribution.
+ findOutliers(relation, means, invCovMatr, clusterCandidates, dim - numuniform, noise);
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(9, "Removing empty clusters.", LOG);
+ }
+
+ // Remove near-empty clusters.
+ for(Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext();) {
+ ClusterCandidate cand = it.next();
+ final int size = cand.ids.size();
+ if(size < minClusterSize) {
+ if(size > 0) {
+ noise.addDBIDs(cand.ids);
+ }
+ it.remove();
+ }
+ }
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
+ }
+
+ // TODO Check all attributes previously deemed uniform (section 3.5).
+
+ if(stepProgress != null) {
+ stepProgress.beginStep(10, "Generating final result.", LOG);
+ }
+
+ // Generate final output.
+ Clustering<SubspaceModel<V>> result = new Clustering<>("P3C", "P3C");
+ for(int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
+ ClusterCandidate candidate = clusterCandidates.get(cluster);
+ CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
+ result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel<>(new Subspace(candidate.dimensions), cvm.getMeanVector(relation))));
+ }
+ LOG.verbose("Noise size: " + noise.size());
+ if(noise.size() > 0) {
+ result.addToplevelCluster(new Cluster<SubspaceModel<V>>(noise, true));
+ }
+
+ if(stepProgress != null) {
+ stepProgress.ensureCompleted(LOG);
+ }
+
+ return result;
+ }
+
+ /**
+ * Construct the 1-signatures by merging adjacent dense bins.
+ *
+ * @param partitions Initial partitions.
+ * @param markers Markers for dense partitions.
+ * @return 1-signatures
+ */
+ private ArrayList<Signature> constructOneSignatures(SetDBIDs[][] partitions, final long[][] markers) {
+ final int dim = partitions.length;
+ // Generate projected p-signature intervals.
+ ArrayList<Signature> signatures = new ArrayList<>();
+ for(int d = 0; d < dim; d++) {
+ final DBIDs[] parts = partitions[d];
+ if(parts == null) {
+ continue; // Never mark any on constant dimensions.
+ }
+ final long[] marked = markers[d];
+ // Find sequences of 1s in marked.
+ for(int start = BitsUtil.nextSetBit(marked, 0); start >= 0;) {
+ int end = BitsUtil.nextClearBit(marked, start + 1);
+ end = (end == -1) ? parts.length : end;
+ int[] signature = new int[dim << 1];
+ Arrays.fill(signature, -1);
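+ // Signature layout: signature[2*d] = first bin, signature[2*d+1] = last bin
+ // (inclusive) in dimension d; -1 marks dimensions not part of the signature.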
+ signature[d << 1] = start;
+ signature[(d << 1) + 1] = end - 1; // inclusive
+ HashSetModifiableDBIDs sids = unionDBIDs(parts, start, end /* exclusive */);
+ if(LOG.isDebugging()) {
+ LOG.debug("1-signature: " + d + " " + start + "-" + (end - 1));
+ }
+ signatures.add(new Signature(signature, sids));
+ start = (end < parts.length) ? BitsUtil.nextSetBit(marked, end + 1) : -1;
+ }
+ }
+ return signatures;
+ }
+
+ /**
+ * Merge 1-signatures into p-signatures.
+ *
+ * @param binCount Number of bins in each dimension.
+ * @param signatures 1-signatures
+ * @return p-signatures
+ */
+ private ArrayList<Signature> mergeClusterCores(final int binCount, ArrayList<Signature> signatures) {
+ MutableProgress mergeProgress = LOG.isVerbose() ? new MutableProgress("Merging signatures.", signatures.size(), LOG) : null;
+
+ // Record the first dimension of each 1-signature, to allow early stopping.
+ int[] firstdim = new int[signatures.size()];
+ for(int i = 0; i < signatures.size(); i++) {
+ firstdim[i] = signatures.get(i).getFirstDim();
+ }
+ LOG.debug("First dimensions: " + FormatUtil.format(firstdim));
+
+ // Merge to (p+1)-signatures (cluster cores).
+ ArrayList<Signature> clusterCores = new ArrayList<>(signatures);
+ // Try merging each 1-signature with each cluster core.
+ for(int i = 0; i < clusterCores.size(); i++) {
+ final Signature parent = clusterCores.get(i);
+ final int end = parent.getFirstDim();
+ for(int j = 0; j < signatures.size() && firstdim[j] < end; j++) {
+ final Signature onesig = signatures.get(j);
+ final Signature merge = mergeSignatures(parent, onesig, binCount);
+ if(merge != null) {
+ // We add each potential core to the list to allow remaining
+ // 1-signatures to try merging with this p-signature as well.
+ clusterCores.add(merge);
+ // Flag both "parents" for removal.
+ parent.prune = true;
+ onesig.prune = true;
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setTotal(clusterCores.size());
+ mergeProgress.incrementProcessed(LOG);
+ }
+ }
+ if(mergeProgress != null) {
+ mergeProgress.setProcessed(mergeProgress.getTotal(), LOG);
+ }
+ return clusterCores;
+ }
+
+ private ArrayList<Signature> pruneRedundantClusterCores(ArrayList<Signature> clusterCores) {
+ // Prune cluster cores based on Definition 3, Condition 2.
+ ArrayList<Signature> retain = new ArrayList<>(clusterCores.size());
+ outer: for(Signature clusterCore : clusterCores) {
+ if(clusterCore.prune) {
+ continue;
+ }
+ for(int k = 0; k < clusterCores.size(); k++) {
+ Signature other = clusterCores.get(k);
+ if(other != clusterCore) {
+ if(other.isSuperset(clusterCore)) {
+ continue outer;
+ }
+ }
+ }
+ if(LOG.isDebugging()) {
+ LOG.debug("Retained cluster core: " + clusterCore);
+ }
+ retain.add(clusterCore);
+ }
+ clusterCores = retain;
+ return clusterCores;
+ }
+
+ /**
+ * Partition the data set into {@code bins} bins in each dimension
+ * <i>independently</i>.
+ *
+ * This can be used to construct a grid approximation of the data using O(d n)
+ * memory.
+ *
+ * When a dimension is found to be constant, it will not be partitioned, but
+ * instead the corresponding array will be set to {@code null}.
+ *
+ * @param relation Data relation to partition
+ * @param bins Number of bins
+ * @return Partitions of each dimension.
+ */
+ private SetDBIDs[][] partitionData(final Relation<V> relation, final int bins) {
+ final int dim = RelationUtil.dimensionality(relation);
+ SetDBIDs[][] partitions = new SetDBIDs[dim][bins];
+ ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
+ DBIDArrayIter iter = ids.iter(); // will be reused.
+ SortDBIDsBySingleDimension sorter = new VectorUtil.SortDBIDsBySingleDimension(relation, 0);
+ for(int d = 0; d < dim; d++) {
+ sorter.setDimension(d);
+ ids.sort(sorter);
+ // Minimum:
+ iter.seek(0);
+ double min = relation.get(iter).doubleValue(d);
+ // Extend:
+ iter.seek(ids.size() - 1);
+ double delta = (relation.get(iter).doubleValue(d) - min) / bins;
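+ // delta is the bin width; a zero width means the attribute is constant.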
+ if(delta > 0.) {
+ SetDBIDs[] dimparts = partitions[d];
+ double split = min + delta;
+ HashSetModifiableDBIDs pids = DBIDUtil.newHashSet();
+ dimparts[0] = pids;
+ int i = 0;
+ for(iter.seek(0); iter.valid(); iter.advance()) {
+ final double v = relation.get(iter).doubleValue(d);
+ if(v <= split || i == dimparts.length - 1) {
+ pids.add(iter);
+ }
+ else {
+ i++;
+ split += delta;
+ pids = DBIDUtil.newHashSet();
+ dimparts[i] = pids;
+ }
+ }
+ for(++i; i < dimparts.length; ++i) {
+ dimparts[i] = pids;
+ }
+ }
+ else {
+ partitions[d] = null; // Flag whole dimension as bad
+ }
+ }
+ return partitions;
+ }
+
+ /**
+ * Compute the union of multiple DBID sets.
+ *
+ * @param parts Parts array
+ * @param start Array start index
+ * @param end Array end index (exclusive)
+ * @return the union of the DBID sets in the given range.
+ */
+ protected HashSetModifiableDBIDs unionDBIDs(final DBIDs[] parts, int start, int end) {
+ int sum = 0;
+ for(int i = start; i < end; i++) {
+ sum += parts[i].size();
+ }
+ HashSetModifiableDBIDs sids = DBIDUtil.newHashSet(sum);
+ for(int i = start; i < end; i++) {
+ sids.addDBIDs(parts[i]);
+ }
+ return sids;
+ }
+
+ /**
+ * Performs a ChiSquared test to determine whether an attribute has a uniform
+ * distribution.
+ *
+ * @param parts Data partitions.
+ * @param marked the marked bins that should be ignored.
+ * @param card Cardinality
+ * @return Position of maximum, or -1 when uniform.
+ */
+ private int chiSquaredUniformTest(SetDBIDs[] parts, long[] marked, int card) {
+ // Remaining number of bins.
+ final int binCount = parts.length - card;
+ // Get global mean over all unmarked bins.
+ int max = 0, maxpos = -1;
+ MeanVariance mv = new MeanVariance();
+ for(int i = 0; i < parts.length; i++) {
+ // Ignore already marked bins.
+ if(BitsUtil.get(marked, i)) {
+ continue;
+ }
+ final int binSupport = parts[i].size();
+ mv.put(binSupport);
+ if(binSupport > max) {
+ max = binSupport;
+ maxpos = i;
+ }
+ }
+ if(mv.getCount() < 1. || !(mv.getNaiveVariance() > 0.)) {
+ return -1;
+ }
+ // Chi-square statistic: naive variance of the bin sizes, divided by their mean.
+ final double chiSquare = mv.getNaiveVariance() / mv.getMean();
+ final double test = ChiSquaredDistribution.cdf(chiSquare, Math.max(1, binCount - card - 1));
+ if((1. - alpha) < test) {
+ return maxpos;
+ }
+ return -1;
+ }
+
+ /**
+ * Computes a fuzzy membership with the weights based on which cluster cores
+ * each data point is part of.
+ *
+ * @param relation Data relation
+ * @param clusterCores the cluster cores.
+ * @param unassigned set to which to add unassigned points.
+ * @param probClusterIGivenX Membership probabilities.
+ * @param clusterWeights Cluster weights
+ */
+ private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, double[] clusterWeights) {
+ final int n = relation.size();
+ final int k = clusterCores.size();
+
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ int count = 0;
+ double[] weights = new double[k];
+ for(int cluster = 0; cluster < k; ++cluster) {
+ if(clusterCores.get(cluster).ids.contains(iter)) {
+ weights[cluster] = 1.;
+ ++count;
+ }
+ }
+
+ // Set value(s) in membership matrix.
+ if(count > 0) {
+ // Rescale.
+ VMath.timesEquals(weights, 1. / count);
+ VMath.plusTimesEquals(clusterWeights, weights, 1. / n);
+ }
+ else {
+ // Does not match any cluster, mark it.
+ unassigned.add(iter);
+ }
+ probClusterIGivenX.put(iter, weights);
+ }
+ }
+
+ /**
+ * Assign unassigned objects to best candidate based on shortest Mahalanobis
+ * distance.
+ *
+ * @param relation Data relation
+ * @param probClusterIGivenX fuzzy membership matrix.
+ * @param means Cluster means.
+ * @param invCovMatr Cluster covariance matrices.
+ * @param clusterWeights Cluster weights.
+ * @param unassigned the list of points not yet assigned.
+ */
+ private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, Vector[] means, Matrix[] invCovMatr, double[] clusterWeights, ModifiableDBIDs unassigned) {
+ if(unassigned.size() == 0) {
+ return;
+ }
+ final int k = means.length;
+ double pweight = 1. / relation.size();
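+ // Weight contribution of a single point to the cluster weights.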
+
+ for(DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
+ // Find the best matching known cluster core using the Mahalanobis
+ // distance.
+ Vector v = relation.get(iter).getColumnVector();
+ int bestCluster = -1;
+ double minDistance = Double.POSITIVE_INFINITY;
+ for(int c = 0; c < k; ++c) {
+ final double distance = MathUtil.mahalanobisDistance(invCovMatr[c], v.minus(means[c]));
+ if(distance < minDistance) {
+ minDistance = distance;
+ bestCluster = c;
+ }
+ }
+ // Assign to best core.
+ double[] weights = new double[k];
+ weights[bestCluster] = 1.0;
+ clusterWeights[bestCluster] += pweight;
+ probClusterIGivenX.put(iter, weights);
+ }
+
+ // Clear the list of unassigned objects.
+ unassigned.clear();
+ }
+
+ /**
+ * Creates a hard clustering from the specified soft membership matrix.
+ *
+ * @param probClusterIGivenX the membership matrix.
+ * @param clusterCores the cluster cores.
+ * @param dbids the set of points to assign.
+ * @return a hard clustering based on the matrix.
+ */
+ private ArrayList<ClusterCandidate> hardClustering(WritableDataStore<double[]> probClusterIGivenX, List<Signature> clusterCores, DBIDs dbids) {
+ final int k = clusterCores.size();
+
+ // Initialize cluster sets.
+ ArrayList<ClusterCandidate> candidates = new ArrayList<>();
+ for(Signature sig : clusterCores) {
+ candidates.add(new ClusterCandidate(sig));
+ }
+
+ // Perform hard partitioning, assigning each data point only to one cluster,
+ // namely that one it is most likely to belong to.
+ for(DBIDIter iter = dbids.iter(); iter.valid(); iter.advance()) {
+ final double[] probs = probClusterIGivenX.get(iter);
+ int bestCluster = 0;
+ double bestProbability = probs[0];
+ for(int c = 1; c < k; ++c) {
+ if(probs[c] > bestProbability) {
+ bestCluster = c;
+ bestProbability = probs[c];
+ }
+ }
+ candidates.get(bestCluster).ids.add(iter);
+ }
+
+ return candidates;
+ }
+
+ /**
+ * Performs outlier detection by testing the Mahalanobis distance of each
+ * point in a cluster against the critical value of the ChiSquared
+ * distribution with as many degrees of freedom as the cluster has relevant
+ * attributes.
+ *
+ * @param relation Data relation
+ * @param means Cluster means
+ * @param invCovMatr Inverse covariance matrices
+ * @param clusterCandidates the list of clusters to check.
+ * @param nonUniformDimensionCount the number of dimensions to consider when
+ * testing.
+ * @param noise the set to which to add points deemed outliers.
+ */
+ private void findOutliers(Relation<V> relation, Vector[] means, Matrix[] invCovMatr, ArrayList<ClusterCandidate> clusterCandidates, int nonUniformDimensionCount, ModifiableDBIDs noise) {
+ final int k = clusterCandidates.size();
+
+ for(int c = 0; c < k; ++c) {
+ final ClusterCandidate candidate = clusterCandidates.get(c);
+ if(candidate.ids.size() < 2) {
+ continue;
+ }
+ final int dof = candidate.dimensions.cardinality();
+ final double threshold = ChiSquaredDistribution.quantile(1 - .001, dof);
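+ // Critical value at the 0.1% significance level, with one degree of
+ // freedom per relevant dimension of the cluster.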
+ for(DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
+ final Vector mean = means[c];
+ final Vector delta = relation.get(iter).getColumnVector().minusEquals(mean);
+ final Matrix invCov = invCovMatr[c];
+ final double distance = MathUtil.mahalanobisDistance(invCov, delta);
+ if(distance >= threshold) {
+ // Outlier, remove it and add it to the outlier set.
+ noise.add(iter);
+ iter.remove();
+ }
+ }
+ }
+ }
+
+ /**
+ * Generates a merged signature from two given signatures, where the second
+ * signature must be a 1-signature.
+ *
+ * @param first First signature.
+ * @param second Second signature, must be a 1-signature.
+ * @param numBins Number of bins per dimension.
+ * @return the merged signature, or null if the merge failed.
+ */
+ protected Signature mergeSignatures(Signature first, Signature second, int numBins) {
+ int d2 = -1;
+ for(int i = 0; i < second.spec.length; i += 2) {
+ if(second.spec[i] >= 0) {
+ assert (d2 == -1) : "Merging with non-1-signature?!?";
+ d2 = i;
+ }
+ }
+ assert (d2 >= 0) : "Merging with empty signature?";
+
+ // Avoid generating redundant signatures.
+ if(first.spec[d2] >= 0) {
+ return null;
+ }
+
+ // Definition 3, Condition 1:
+ // True support:
+ final ModifiableDBIDs intersection = DBIDUtil.intersection(first.ids, second.ids);
+ final int support = intersection.size();
+ // Interval width, computed using selected number of bins / total bins
+ double width = (second.spec[d2 + 1] - second.spec[d2] + 1.) / (double) numBins;
+ // Expected size thus:
+ double expect = first.ids.size() * width;
+ if(support <= expect || support < minClusterSize) {
+ return null;
+ }
+ final double test = PoissonDistribution.rawProbability(support, expect);
+    if(poissonThreshold <= test) {
+ return null;
+ }
+ // Create merged signature.
+ int[] spec = first.spec.clone();
+ spec[d2] = second.spec[d2];
+    spec[d2 + 1] = second.spec[d2 + 1];
+
+ final Signature newsig = new Signature(spec, intersection);
+ if(LOG.isDebugging()) {
+ LOG.debug(newsig.toString());
+ }
+ return newsig;
+ }
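+
+  // Worked example of the merge test (illustrative numbers, not from the
+  // paper): let first hold 200 ids, and let second cover bins 5..9 out of
+  // numBins = 20, so width = (9 - 5 + 1) / 20. = 0.25 and expect = 200 *
+  // 0.25 = 50. The merge survives only if the true intersection exceeds 50
+  // and is at least minClusterSize, and additionally
+  //   PoissonDistribution.rawProbability(support, 50) < poissonThreshold
+  // i.e. the observed support must be improbably large under independence.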
+
+ /**
+ * P3C Cluster signature.
+ *
+ * @author Erich Schubert
+ */
+ private static class Signature {
+ /**
+ * Subspace specification
+ */
+ int[] spec;
+
+ /**
+ * Object ids.
+ */
+ DBIDs ids;
+
+ /**
+ * Pruning flag.
+ */
+ boolean prune = false;
+
+ /**
+ * Constructor.
+ *
+ * @param spec Subspace specification
+ * @param ids IDs.
+ */
+ private Signature(int[] spec, DBIDs ids) {
+ super();
+ this.spec = spec;
+ this.ids = ids;
+ }
+
+ /**
+ * Test whether this is a superset of the other signature.
+ *
+ * @param other Other signature.
+ * @return {@code true} when this is a superset.
+ */
+ public boolean isSuperset(Signature other) {
+ for(int i = 0; i < spec.length; i += 2) {
+        if(spec[i] != other.spec[i] || spec[i + 1] != other.spec[i + 1]) {
+ if(other.spec[i] != -1) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Find the first dimension set in this signature.
+ *
+ * @return Dimension
+ */
+ public int getFirstDim() {
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ return (i >>> 1);
+ }
+ }
+ return -1;
+ }
+
+ @Override
+ public String toString() {
+ int p = 0;
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ p++;
+ }
+ }
+ StringBuilder buf = new StringBuilder();
+ buf.append(p).append("-signature: ");
+ for(int i = 0; i < spec.length; i += 2) {
+ if(spec[i] >= 0) {
+ buf.append(i >>> 1).append(':');
+ buf.append(spec[i]).append('-').append(spec[i + 1]).append(' ');
+ }
+ }
+ buf.append(" size: ").append(ids.size());
+ return buf.toString();
+ }
+ }
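+
+  // Layout example (derived from the accessors above): spec stores one
+  // (start bin, end bin) pair per dimension, with -1 marking unused
+  // dimensions. In 3 dimensions, spec = {2, 4, -1, -1, 7, 7} is a
+  // 2-signature covering bins 2-4 of dimension 0 and bin 7 of dimension 2;
+  // toString() renders it as "2-signature: 0:2-4 2:7-7  size: ...".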
+
+ /**
+ * This class is used to represent potential clusters.
+ *
+ * @author Erich Schubert
+ */
+ private static class ClusterCandidate {
+ /**
+ * Selected dimensions
+ */
+ public final BitSet dimensions;
+
+ /**
+ * Objects contained in cluster.
+ */
+ public final ModifiableDBIDs ids;
+
+ /**
+ * Constructor.
+ *
+ * @param clusterCore Signature
+ */
+ public ClusterCandidate(Signature clusterCore) {
+      this.dimensions = new BitSet(clusterCore.spec.length >> 1);
+      for(int i = 0; i < clusterCore.spec.length; i += 2) {
+        if(clusterCore.spec[i] >= 0) {
+          this.dimensions.set(i >> 1);
+        }
+      }
+ this.ids = DBIDUtil.newArray(clusterCore.ids.size());
+ }
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Florian Nuecke
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
+ /**
+     * Parameter for the chi-squared test threshold.
+ */
+ public static final OptionID ALPHA_THRESHOLD_ID = new OptionID("p3c.alpha", "The significance level for uniform testing in the initial binning step.");
+
+ /**
+     * Parameter for the Poisson test threshold.
+     */
+    public static final OptionID POISSON_THRESHOLD_ID = new OptionID("p3c.threshold", "The threshold value for the Poisson test used when merging signatures.");
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ public static final OptionID MAX_EM_ITERATIONS_ID = new OptionID("p3c.em.maxiter", "The maximum number of iterations for the EM step. Use -1 to run until delta convergence.");
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ public static final OptionID EM_DELTA_ID = new OptionID("p3c.em.delta", "The change delta for the EM step below which to stop.");
+
+ /**
+     * Minimum cluster size for noise flagging. (Not present in the original
+     * publication.)
+     */
+    public static final OptionID MIN_CLUSTER_SIZE_ID = new OptionID("p3c.minsize", "The minimum size of a cluster; smaller candidates are treated as noise (this is an extension that is not mentioned in the paper).");
+
+ /**
+     * Parameter for the chi-squared test threshold.
+ *
+     * While statistically motivated values such as 0.01 are a good starting
+     * point, we found in our experiments that this parameter often needs
+     * adjusting.
+ */
+ protected double alpha;
+
+ /**
+     * Parameter for the Poisson test threshold.
+ */
+ protected double poissonThreshold;
+
+ /**
+ * Maximum number of iterations for the EM step.
+ */
+ protected int maxEmIterations;
+
+ /**
+ * Threshold when to stop EM iterations.
+ */
+ protected double emDelta;
+
+ /**
+     * Minimum cluster size for noise flagging. (Not present in the original
+     * publication.)
+ */
+ protected int minClusterSize;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+
+ {
+ DoubleParameter param = new DoubleParameter(ALPHA_THRESHOLD_ID, .001);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+ if(config.grab(param)) {
+ alpha = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(POISSON_THRESHOLD_ID, 1.e-4);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ param.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
+ if(config.grab(param)) {
+ poissonThreshold = param.getValue();
+ }
+ }
+
+ {
+ IntParameter param = new IntParameter(MAX_EM_ITERATIONS_ID, 20);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_MINUSONE_INT);
+ if(config.grab(param)) {
+ maxEmIterations = param.getValue();
+ }
+ }
+
+ {
+ DoubleParameter param = new DoubleParameter(EM_DELTA_ID, 1.e-5);
+ param.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(param)) {
+ emDelta = param.getValue();
+ }
+ }
+
+ {
+ IntParameter param = new IntParameter(MIN_CLUSTER_SIZE_ID, 1);
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(param)) {
+ minClusterSize = param.getValue();
+ }
+ }
+ }
+
+ @Override
+ protected P3C<V> makeInstance() {
+ return new P3C<>(alpha, poissonThreshold, maxEmIterations, emDelta, minClusterSize);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
index 92158734..03e9978f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/PROCLUS.java
@@ -67,7 +67,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -148,7 +148,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) {
DistanceQuery<V, DoubleDistance> distFunc = this.getDistanceQuery(database);
RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
- final Random random = rnd.getRandom();
+ final Random random = rnd.getSingleThreadedRandom();
if (RelationUtil.dimensionality(relation) < l) {
throw new IllegalStateException("Dimensionality of data < parameter l! " + "(" + RelationUtil.dimensionality(relation) + " < " + l + ")");
@@ -844,7 +844,7 @@ public class PROCLUS<V extends NumberVector<?>> extends AbstractProjectedCluster
configL(config);
IntParameter m_iP = new IntParameter(M_I_ID, 10);
- m_iP.addConstraint(new GreaterConstraint(0));
+ m_iP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(m_iP)) {
m_i = m_iP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
index c8d0833e..e6245f6e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/subspace/SUBCLU.java
@@ -54,7 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -77,7 +77,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Elke Achtert
*
* @apiviz.uses DBSCAN
- * @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction
+ * @apiviz.uses DimensionSelectingSubspaceDistanceFunction
* @apiviz.has SubspaceModel
*
* @param <V> the type of FeatureVector handled by this Algorithm
@@ -488,7 +488,7 @@ public class SUBCLU<V extends NumberVector<?>> extends AbstractAlgorithm<Cluster
}
IntParameter minptsP = new IntParameter(MINPTS_ID);
- minptsP.addConstraint(new GreaterConstraint(0));
+ minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(minptsP)) {
minpts = minptsP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java
index ad0b8175..65447713 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ABOD.java
@@ -23,59 +23,45 @@ package de.lmu.ifi.dbs.elki.algorithm.outlier;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.HashMap;
-
-import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.QueryUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
-import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
-import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
-import de.lmu.ifi.dbs.elki.distance.similarityfunction.PrimitiveSimilarityFunction;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.PolynomialKernelFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
-import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
-import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap;
-import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
- * Angle-Based Outlier Detection
+ * Angle-Based Outlier Detection / Angle-Based Outlier Factor.
*
* Outlier detection using variance analysis on angles, especially for high
- * dimensional data sets.
+ * dimensional data sets. Exact version, which has cubic runtime (see also
+ * {@link FastABOD} and {@link LBABOD} for faster versions).
*
* H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in
* High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge
@@ -84,475 +70,107 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @author Matthias Schubert (Original Code)
* @author Erich Schubert (ELKIfication)
*
- * @apiviz.has KNNQuery
- *
* @param <V> Vector type
*/
@Title("ABOD: Angle-Based Outlier Detection")
@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.")
@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946")
-public class ABOD<V extends NumberVector<?>> extends AbstractDistanceBasedAlgorithm<V, DoubleDistance, OutlierResult> implements OutlierAlgorithm {
+public class ABOD<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(ABOD.class);
/**
- * Parameter for k, the number of neighbors used in kNN queries.
- */
- public static final OptionID K_ID = new OptionID("abod.k", "Parameter k for kNN queries.");
-
- /**
- * Parameter for sample size to be used in fast mode.
- */
- public static final OptionID FAST_SAMPLE_ID = new OptionID("abod.samplesize", "Sample size to enable fast mode.");
-
- /**
- * Parameter for the kernel function.
- */
- public static final OptionID KERNEL_FUNCTION_ID = new OptionID("abod.kernelfunction", "Kernel function to use.");
-
- /**
- * The preprocessor used to materialize the kNN neighborhoods.
- */
- public static final OptionID PREPROCESSOR_ID = new OptionID("abod.knnquery", "Processor to compute the kNN neighborhoods.");
-
- /**
- * use alternate code below.
- */
- private static final boolean USE_RND_SAMPLE = false;
-
- /**
- * k parameter.
- */
- private int k;
-
- /**
- * Variable to store fast mode sampling value.
- */
- int sampleSize = 0;
-
- /**
* Store the configured Kernel version.
*/
- private PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction;
-
- /**
- * Static DBID map.
- */
- private ArrayDBIDs staticids = null;
+ protected SimilarityFunction<? super V, DoubleDistance> kernelFunction;
/**
- * Actual constructor, with parameters. Fast mode (sampling).
+ * Constructor for Angle-Based Outlier Detection (ABOD).
*
- * @param k k parameter
- * @param sampleSize sample size
- * @param primitiveKernelFunction Kernel function to use
- * @param distanceFunction Distance function
+ * @param kernelFunction kernel function to use
*/
- public ABOD(int k, int sampleSize, PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction, DistanceFunction<V, DoubleDistance> distanceFunction) {
- super(distanceFunction);
- this.k = k;
- this.sampleSize = sampleSize;
- this.primitiveKernelFunction = primitiveKernelFunction;
+ public ABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction) {
+ super();
+ this.kernelFunction = kernelFunction;
}
/**
- * Actual constructor, with parameters. Slow mode (exact).
+ * Run ABOD on the data set.
*
- * @param k k parameter
- * @param primitiveKernelFunction kernel function to use
- * @param distanceFunction Distance function
+ * @param relation Relation to process
+ * @return Outlier detection result
*/
- public ABOD(int k, PrimitiveSimilarityFunction<? super V, DoubleDistance> primitiveKernelFunction, DistanceFunction<V, DoubleDistance> distanceFunction) {
- super(distanceFunction);
- this.k = k;
- this.sampleSize = 0;
- this.primitiveKernelFunction = primitiveKernelFunction;
- }
+ public OutlierResult run(Database db, Relation<V> relation) {
+ DBIDs ids = relation.getDBIDs();
+ // Build a kernel matrix, to make O(n^3) slightly less bad.
+ SimilarityQuery<V, DoubleDistance> sq = db.getSimilarityQuery(relation, kernelFunction);
+ KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
- /**
- * Main part of the algorithm. Exact version.
- *
- * @param relation Relation to query
- * @return result
- */
- public OutlierResult getRanking(Relation<V> relation) {
- // Fix a static set of IDs
- if (relation.getDBIDs() instanceof DBIDRange) {
- staticids = (DBIDRange) relation.getDBIDs();
- } else {
- staticids = DBIDUtil.newArray(relation.getDBIDs());
- ((ArrayModifiableDBIDs) staticids).sort();
- }
-
- KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids);
- ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(relation.size());
-
- // preprocess kNN neighborhoods
- KNNQuery<V, DoubleDistance> knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k);
+ WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
+ DoubleMinMax minmaxabod = new DoubleMinMax();
MeanVariance s = new MeanVariance();
- for (DBIDIter objKey = relation.iterDBIDs(); objKey.valid(); objKey.advance()) {
- s.reset();
-
- KNNList<DoubleDistance> neighbors = knnQuery.getKNNForDBID(objKey, k);
- for (DBIDIter key1 = neighbors.iter(); key1.valid(); key1.advance()) {
- for (DBIDIter key2 = neighbors.iter(); key2.valid(); key2.advance()) {
- if (DBIDUtil.equal(key2, key1) || DBIDUtil.equal(key1, objKey) || DBIDUtil.equal(key2, objKey)) {
- continue;
- }
- double nenner = calcDenominator(kernelMatrix, objKey, key1, key2);
-
- if (nenner != 0) {
- double sqrtnenner = Math.sqrt(nenner);
- double tmp = calcNumerator(kernelMatrix, objKey, key1, key2) / nenner;
- s.put(tmp, 1 / sqrtnenner);
- }
-
- }
- }
- // Sample variance probably would be correct, however the numerical
- // instabilities can actually break ABOD here.
- pq.add(DBIDUtil.newPair(s.getNaiveVariance(), objKey));
+ for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) {
+ final double abof = computeABOF(relation, kernelMatrix, pA, s);
+ minmaxabod.put(abof);
+ abodvalues.putDouble(pA, abof);
}
- DoubleMinMax minmaxabod = new DoubleMinMax();
- WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
- while (!pq.isEmpty()) {
- DoubleDBIDPair pair = pq.poll();
- abodvalues.putDouble(pair, pair.doubleValue());
- minmaxabod.put(pair.doubleValue());
- }
// Build result representation.
- Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs());
+ Relation<Double> scoreResult = new MaterializedRelation<>("Angle-Based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
return new OutlierResult(scoreMeta, scoreResult);
}
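+
+  // Hypothetical usage sketch (variable names and the kernel constructor
+  // are assumptions, not part of this change): given a Database db and a
+  // Relation<DoubleVector> rel,
+  //   ABOD<DoubleVector> abod = new ABOD<>(new PolynomialKernelFunction(2));
+  //   OutlierResult result = abod.run(db, rel);
+  // Small ABOF values indicate outliers, hence the inverted score meta.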
/**
- * Main part of the algorithm. Fast version.
+ * Compute the exact ABOF value.
*
- * @param relation Relation to use
- * @return result
- */
- public OutlierResult getFastRanking(Relation<V> relation) {
- final DBIDs ids = relation.getDBIDs();
- // Fix a static set of IDs
- // TODO: add a DBIDUtil.ensureSorted?
- if (relation.getDBIDs() instanceof DBIDRange) {
- staticids = (DBIDRange) relation.getDBIDs();
- } else {
- staticids = DBIDUtil.newArray(relation.getDBIDs());
- ((ArrayModifiableDBIDs) staticids).sort();
- }
-
- KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, relation, staticids);
-
- ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(relation.size());
- // get Candidate Ranking
- for (DBIDIter aKey = relation.iterDBIDs(); aKey.valid(); aKey.advance()) {
- WritableDoubleDataStore dists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
- // determine kNearestNeighbors and pairwise distances
- ComparableMinHeap<DoubleDBIDPair> nn;
- if (!USE_RND_SAMPLE) {
- nn = calcDistsandNN(relation, kernelMatrix, sampleSize, aKey, dists);
- } else {
- // alternative:
- nn = calcDistsandRNDSample(relation, kernelMatrix, sampleSize, aKey, dists);
- }
-
- // get normalization
- double[] counter = calcFastNormalization(aKey, dists, staticids);
-      // convert the PQ into a list
- ModifiableDBIDs neighbors = DBIDUtil.newArray(nn.size());
- while (!nn.isEmpty()) {
- neighbors.add(nn.poll());
- }
- // getFilter
- double var = getAbofFilter(kernelMatrix, aKey, dists, counter[1], counter[0], neighbors);
- pq.add(DBIDUtil.newPair(var, aKey));
- }
- // refine Candidates
- ComparableMinHeap<DoubleDBIDPair> resqueue = new ComparableMinHeap<>(k);
- MeanVariance s = new MeanVariance();
- while (!pq.isEmpty()) {
- if (resqueue.size() == k && pq.peek().doubleValue() > resqueue.peek().doubleValue()) {
- break;
- }
- // double approx = pq.peek().getFirst();
- DBIDRef aKey = pq.poll();
- s.reset();
- for (DBIDIter bKey = relation.iterDBIDs(); bKey.valid(); bKey.advance()) {
- if (DBIDUtil.equal(bKey, aKey)) {
- continue;
- }
- for (DBIDIter cKey = relation.iterDBIDs(); cKey.valid(); cKey.advance()) {
- if (DBIDUtil.equal(cKey, aKey)) {
- continue;
- }
- // double nenner = dists[y]*dists[z];
- double nenner = calcDenominator(kernelMatrix, aKey, bKey, cKey);
- if (nenner != 0) {
- double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner;
- double sqrtNenner = Math.sqrt(nenner);
- s.put(tmp, 1 / sqrtNenner);
- }
- }
- }
- double var = s.getSampleVariance();
- if (resqueue.size() < k) {
- resqueue.add(DBIDUtil.newPair(var, aKey));
- } else {
- if (resqueue.peek().doubleValue() > var) {
- resqueue.replaceTopElement(DBIDUtil.newPair(var, aKey));
- }
- }
-
- }
- DoubleMinMax minmaxabod = new DoubleMinMax();
- WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
- while (!pq.isEmpty()) {
- DoubleDBIDPair pair = pq.poll();
- abodvalues.putDouble(pair, pair.doubleValue());
- minmaxabod.put(pair.doubleValue());
- }
- // Build result representation.
- Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids);
- OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
- return new OutlierResult(scoreMeta, scoreResult);
- }
-
- private double[] calcFastNormalization(DBIDRef x, WritableDoubleDataStore dists, DBIDs ids) {
- double[] result = new double[2];
-
- double sum = 0;
- double sumF = 0;
- for (DBIDIter yKey = ids.iter(); yKey.valid(); yKey.advance()) {
- if (dists.doubleValue(yKey) != 0) {
- double tmp = 1 / Math.sqrt(dists.doubleValue(yKey));
- sum += tmp;
- sumF += (1 / dists.doubleValue(yKey)) * tmp;
- }
- }
- double sofar = 0;
- double sofarF = 0;
- for (DBIDIter zKey = ids.iter(); zKey.valid(); zKey.advance()) {
- if (dists.doubleValue(zKey) != 0) {
- double tmp = 1 / Math.sqrt(dists.doubleValue(zKey));
- sofar += tmp;
- double rest = sum - sofar;
- result[0] += tmp * rest;
-
- sofarF += (1 / dists.doubleValue(zKey)) * tmp;
- double restF = sumF - sofarF;
- result[1] += (1 / dists.doubleValue(zKey)) * tmp * restF;
- }
- }
- return result;
- }
-
- private double getAbofFilter(KernelMatrix kernelMatrix, DBIDRef aKey, WritableDoubleDataStore dists, double fulCounter, double counter, DBIDs neighbors) {
- double sum = 0.0;
- double sqrSum = 0.0;
- double partCounter = 0;
- for (DBIDIter bKey = neighbors.iter(); bKey.valid(); bKey.advance()) {
- if (DBIDUtil.equal(bKey, aKey)) {
+ * @param relation Relation
+ * @param kernelMatrix Kernel matrix
+ * @param pA Object A to compute ABOF for
+ * @param s Statistics tracker
+ * @return ABOF value
+ */
+ protected double computeABOF(Relation<V> relation, KernelMatrix kernelMatrix, DBIDRef pA, MeanVariance s) {
+ s.reset(); // Reused
+ double simAA = kernelMatrix.getSimilarity(pA, pA);
+
+ for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) {
+ if (DBIDUtil.equal(nB, pA)) {
continue;
}
- for (DBIDIter cKey = neighbors.iter(); cKey.valid(); cKey.advance()) {
- if (DBIDUtil.equal(cKey, aKey)) {
- continue;
- }
- if (DBIDUtil.compare(bKey, cKey) > 0) {
- double nenner = dists.doubleValue(bKey) * dists.doubleValue(cKey);
- if (nenner != 0) {
- double tmp = calcNumerator(kernelMatrix, aKey, bKey, cKey) / nenner;
- double sqrtNenner = Math.sqrt(nenner);
- sum += tmp * (1 / sqrtNenner);
- sqrSum += tmp * tmp * (1 / sqrtNenner);
- partCounter += (1 / (sqrtNenner * nenner));
- }
- }
- }
- }
- // TODO: Document the meaning / use of fulCounter, partCounter.
- double mu = (sum + (fulCounter - partCounter)) / counter;
- return (sqrSum / counter) - (mu * mu);
- }
-
- /**
- * Compute the cosinus value between vectors aKey and bKey.
- *
- * @param kernelMatrix
- * @param aKey
- * @param bKey
- * @return cosinus value
- */
- private double calcCos(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey) {
- final int ai = mapDBID(aKey);
- final int bi = mapDBID(bKey);
- return kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, bi) - 2 * kernelMatrix.getDistance(ai, bi);
- }
-
- private int mapDBID(DBIDRef aKey) {
- // TODO: this is not the most efficient...
- int off = staticids.binarySearch(aKey);
- if (off < 0) {
- throw new AbortException("Did not find id " + aKey.toString() + " in staticids. " + staticids.contains(aKey));
- }
- return off + 1;
- }
-
- private double calcDenominator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) {
- return calcCos(kernelMatrix, aKey, bKey) * calcCos(kernelMatrix, aKey, cKey);
- }
-
- private double calcNumerator(KernelMatrix kernelMatrix, DBIDRef aKey, DBIDRef bKey, DBIDRef cKey) {
- final int ai = mapDBID(aKey);
- final int bi = mapDBID(bKey);
- final int ci = mapDBID(cKey);
- return (kernelMatrix.getDistance(ai, ai) + kernelMatrix.getDistance(bi, ci) - kernelMatrix.getDistance(ai, ci) - kernelMatrix.getDistance(ai, bi));
- }
-
- private ComparableMinHeap<DoubleDBIDPair> calcDistsandNN(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) {
- ComparableMinHeap<DoubleDBIDPair> nn = new ComparableMinHeap<>(sampleSize);
- for (DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) {
- double val = calcCos(kernelMatrix, aKey, bKey);
- dists.putDouble(bKey, val);
- if (nn.size() < sampleSize) {
- nn.add(DBIDUtil.newPair(val, bKey));
- } else {
- if (val < nn.peek().doubleValue()) {
- nn.replaceTopElement(DBIDUtil.newPair(val, bKey));
- }
- }
- }
- return nn;
- }
-
- private ComparableMinHeap<DoubleDBIDPair> calcDistsandRNDSample(Relation<V> data, KernelMatrix kernelMatrix, int sampleSize, DBIDRef aKey, WritableDoubleDataStore dists) {
- ComparableMinHeap<DoubleDBIDPair> nn = new ComparableMinHeap<>(sampleSize);
- int step = (int) ((double) data.size() / (double) sampleSize);
- int counter = 0;
- for (DBIDIter bKey = data.iterDBIDs(); bKey.valid(); bKey.advance()) {
- double val = calcCos(kernelMatrix, aKey, bKey);
- dists.putDouble(bKey, val);
- if (counter % step == 0) {
- nn.add(DBIDUtil.newPair(val, bKey));
+ double simBB = kernelMatrix.getSimilarity(nB, nB);
+ double simAB = kernelMatrix.getSimilarity(pA, nB);
+ double sqdAB = simAA + simBB - simAB - simAB;
+ if (!(sqdAB > 0.)) {
+ continue;
}
- counter++;
- }
- return nn;
- }
-
- /**
- * Get explanations for points in the database.
- *
- * @param data to get explanations for
- * @return String explanation
- */
- // TODO: this should be done by the result classes.
- public String getExplanations(Relation<V> data) {
- KernelMatrix kernelMatrix = new KernelMatrix(primitiveKernelFunction, data, staticids);
- // PQ for Outlier Ranking
- ComparableMaxHeap<DoubleDBIDPair> pq = new ComparableMaxHeap<>(data.size());
- HashMap<DBID, DBIDs> explaintab = new HashMap<>();
- // test all objects
- MeanVariance s = new MeanVariance(), s2 = new MeanVariance();
- for (DBIDIter objKey = data.iterDBIDs(); objKey.valid(); objKey.advance()) {
- s.reset();
- // Queue for the best explanation
- ComparableMinHeap<DoubleDBIDPair> explain = new ComparableMinHeap<>();
- // determine Object
- // for each pair of other objects
- for (DBIDIter key1 = data.iterDBIDs(); key1.valid(); key1.advance()) {
- // Collect Explanation Vectors
- s2.reset();
- if (DBIDUtil.equal(objKey, key1)) {
+ for (DBIDIter nC = relation.iterDBIDs(); nC.valid(); nC.advance()) {
+        if (DBIDUtil.equal(nC, pA) || DBIDUtil.compare(nC, nB) <= 0) {
continue;
}
- for (DBIDIter key2 = data.iterDBIDs(); key2.valid(); key2.advance()) {
- if (DBIDUtil.equal(key2, key1) || DBIDUtil.equal(objKey, key2)) {
- continue;
- }
- double nenner = calcDenominator(kernelMatrix, objKey, key1, key2);
- if (nenner != 0) {
- double tmp = calcNumerator(kernelMatrix, objKey, key1, key2) / nenner;
- double sqr = Math.sqrt(nenner);
- s2.put(tmp, 1 / sqr);
- }
- }
- explain.add(DBIDUtil.newPair(s2.getSampleVariance(), key1));
- s.put(s2);
- }
- // build variance of the observed vectors
- pq.add(DBIDUtil.newPair(s.getSampleVariance(), objKey));
- //
- ModifiableDBIDs expList = DBIDUtil.newArray();
- expList.add(explain.poll());
- while (!explain.isEmpty()) {
- DBIDRef nextKey = explain.poll();
- if (DBIDUtil.equal(nextKey, objKey)) {
+ double simCC = kernelMatrix.getSimilarity(nC, nC);
+ double simAC = kernelMatrix.getSimilarity(pA, nC);
+        double sqdAC = simAA + simCC - simAC - simAC;
+ if (!(sqdAC > 0.)) {
continue;
}
- double max = Double.MIN_VALUE;
- for (DBIDIter exp = expList.iter(); exp.valid(); exp.advance()) {
- if (DBIDUtil.equal(exp, objKey) || DBIDUtil.equal(nextKey, exp)) {
- continue;
- }
- double nenner = Math.sqrt(calcCos(kernelMatrix, objKey, nextKey)) * Math.sqrt(calcCos(kernelMatrix, objKey, exp));
- double angle = calcNumerator(kernelMatrix, objKey, nextKey, exp) / nenner;
- max = Math.max(angle, max);
- }
- if (max < 0.5) {
- expList.add(nextKey);
- }
- }
- explaintab.put(DBIDUtil.deref(objKey), expList);
- }
- StringBuilder buf = new StringBuilder();
- buf.append("Result: ABOD\n");
- int count = 0;
- while (!pq.isEmpty()) {
- if (count > 10) {
- break;
+ // Exploit bilinearity of scalar product:
+ // <B-A, C-A> = <B, C-A> - <A,C-A>
+ // = <B,C> - <B,A> - <A,C> + <A,A>
+        // so only <B,C>, <B,A>, <A,C> and the cached <A,A> are needed.
+        double simBC = kernelMatrix.getSimilarity(nB, nC);
+        double numerator = simBC - simAB - simAC + simAA;
+ double val = numerator / (sqdAB * sqdAC);
+ s.put(val, 1. / Math.sqrt(sqdAB * sqdAC));
}
- double factor = pq.peek().doubleValue();
- DBIDRef key = pq.poll();
- buf.append(data.get(key)).append(' ');
- buf.append(count).append(" Factor=").append(factor).append(' ').append(key).append('\n');
- DBIDs expList = explaintab.get(key);
- generateExplanation(buf, data, key, expList);
- count++;
- }
- return buf.toString();
- }
-
- private void generateExplanation(StringBuilder buf, Relation<V> data, DBIDRef key, DBIDs expList) {
- Vector vect1 = data.get(key).getColumnVector();
- for (DBIDIter iter = expList.iter(); iter.valid(); iter.advance()) {
- buf.append("Outlier: ").append(vect1).append('\n');
- Vector exp = data.get(iter).getColumnVector();
- buf.append("Most common neighbor: ").append(exp).append('\n');
- // determine difference Vector
- Vector vals = exp.minus(vect1);
- buf.append(vals).append('\n');
- }
- }
-
- /**
- * Run ABOD on the data set.
- *
- * @param relation Relation to process
- * @return Outlier detection result
- */
- public OutlierResult run(Relation<V> relation) {
- if (sampleSize > 0) {
- return getFastRanking(relation);
- } else {
- return getRanking(relation);
}
+ // Sample variance probably would be correct, but the ABOD publication
+ // uses the naive variance.
+ final double abof = s.getNaiveVariance();
+ return abof;
}
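+
+  // In formula form (matching the accumulation above): with AB = B - A and
+  // AC = C - A, each unordered pair {B, C} of other objects contributes
+  //   value  = <AB, AC> / (||AB||^2 * ||AC||^2)
+  //   weight = 1 / (||AB|| * ||AC||)
+  // and ABOF(A) is the weighted naive variance of these values.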
@Override
@@ -572,45 +190,29 @@ public class ABOD<V extends NumberVector<?>> extends AbstractDistanceBasedAlgori
*
* @apiviz.exclude
*/
- public static class Parameterizer<V extends NumberVector<?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<V, DoubleDistance> {
- /**
- * k Parameter.
- */
- protected int k = 0;
-
+ public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
/**
- * Sample size.
+ * Parameter for the kernel function.
*/
- protected int sampleSize = 0;
+ public static final OptionID KERNEL_FUNCTION_ID = new OptionID("abod.kernelfunction", "Kernel function to use.");
/**
     * Kernel function.
*/
- protected PrimitiveSimilarityFunction<V, DoubleDistance> primitiveKernelFunction = null;
+ protected SimilarityFunction<V, DoubleDistance> kernelFunction = null;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- final IntParameter kP = new IntParameter(K_ID, 30);
- kP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(kP)) {
- k = kP.getValue();
- }
- final IntParameter sampleSizeP = new IntParameter(FAST_SAMPLE_ID);
- sampleSizeP.addConstraint(new GreaterEqualConstraint(1));
- sampleSizeP.setOptional(true);
- if (config.grab(sampleSizeP)) {
- sampleSize = sampleSizeP.getValue();
- }
- final ObjectParameter<PrimitiveSimilarityFunction<V, DoubleDistance>> param = new ObjectParameter<>(KERNEL_FUNCTION_ID, PrimitiveSimilarityFunction.class, PolynomialKernelFunction.class);
+ final ObjectParameter<SimilarityFunction<V, DoubleDistance>> param = new ObjectParameter<>(KERNEL_FUNCTION_ID, SimilarityFunction.class, PolynomialKernelFunction.class);
if (config.grab(param)) {
- primitiveKernelFunction = param.instantiateClass(config);
+ kernelFunction = param.instantiateClass(config);
}
}
@Override
protected ABOD<V> makeInstance() {
- return new ABOD<>(k, sampleSize, primitiveKernelFunction, distanceFunction);
+ return new ABOD<>(kernelFunction);
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java
index 99356aef..2b12b306 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AbstractAggarwalYuOutlier.java
@@ -38,11 +38,12 @@ import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.IntIntPair;
@@ -161,7 +162,7 @@ public abstract class AbstractAggarwalYuOutlier<V extends NumberVector<?>> exten
protected static double sparsity(final int setsize, final int dbsize, final int k, final double phi) {
// calculate sparsity c
final double f = 1. / phi;
- final double fK = Math.pow(f, k);
+ final double fK = MathUtil.powi(f, k);
final double sC = (setsize - (dbsize * fK)) / Math.sqrt(dbsize * fK * (1 - fK));
return sC;
}
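+
+  // Worked example (illustrative numbers): with phi = 4 (f = 0.25) and
+  // k = 2, fK = 0.0625, so for dbsize = 10000 a cube is expected to hold
+  // 625 points. A cube with setsize = 500 then scores
+  //   sC = (500 - 625) / Math.sqrt(625 * 0.9375) // roughly -5.16
+  // strongly negative values indicate unusually sparse subspace regions.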
@@ -242,12 +243,12 @@ public abstract class AbstractAggarwalYuOutlier<V extends NumberVector<?>> exten
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterEqualConstraint(2));
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(kP)) {
k = kP.getValue();
}
final IntParameter phiP = new IntParameter(PHI_ID);
- phiP.addConstraint(new GreaterEqualConstraint(2));
+ phiP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(phiP)) {
phi = phiP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java
index 89be0e66..c4e5cc5d 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/AggarwalYuEvolutionary.java
@@ -56,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -132,24 +132,24 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
final int dbsize = relation.size();
ArrayList<ArrayList<DBIDs>> ranges = buildRanges(relation);
- Heap<Individuum>.UnorderedIter individuums = (new EvolutionarySearch(relation, ranges, m, rnd.getRandom())).run();
+ Heap<Individuum>.UnorderedIter individuums = (new EvolutionarySearch(relation, ranges, m, rnd.getSingleThreadedRandom())).run();
WritableDoubleDataStore outlierScore = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
- for (; individuums.valid(); individuums.advance()) {
+ for(; individuums.valid(); individuums.advance()) {
DBIDs ids = computeSubspaceForGene(individuums.get().getGene(), ranges);
double sparsityC = sparsity(ids.size(), dbsize, k, phi);
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
double prev = outlierScore.doubleValue(iter);
- if (Double.isNaN(prev) || sparsityC < prev) {
+ if(Double.isNaN(prev) || sparsityC < prev) {
outlierScore.putDouble(iter, sparsityC);
}
}
}
DoubleMinMax minmax = new DoubleMinMax();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double val = outlierScore.doubleValue(iditer);
- if (Double.isNaN(val)) {
+ if(Double.isNaN(val)) {
outlierScore.putDouble(iditer, 0.0);
val = 0.0;
}
@@ -219,12 +219,12 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
ArrayList<Individuum> pop = initialPopulation(m);
// best Population
TopBoundedHeap<Individuum> bestSol = new TopBoundedHeap<>(m, Collections.reverseOrder());
- for (Individuum ind : pop) {
+ for(Individuum ind : pop) {
bestSol.add(ind);
}
int iterations = 0;
- while (!checkConvergence(pop)) {
+ while(!checkConvergence(pop)) {
Collections.sort(pop);
pop = rouletteRankSelection(pop);
// Crossover
@@ -232,28 +232,28 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
// Mutation with probability 0.25 , 0.25
pop = mutation(pop, 0.5, 0.5);
// Avoid duplicates
- ind: for (Individuum ind : pop) {
- for (Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
- if (it.get().equals(ind)) {
+ ind: for(Individuum ind : pop) {
+ for(Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
+ if(it.get().equals(ind)) {
continue ind;
}
}
bestSol.add(ind);
}
- if (LOG.isDebuggingFinest()) {
+ if(LOG.isDebuggingFinest()) {
StringBuilder buf = new StringBuilder();
buf.append("Top solutions:\n");
- for (Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
+ for(Heap<Individuum>.UnorderedIter it = bestSol.unorderedIter(); it.valid(); it.advance()) {
buf.append(it.get().toString()).append('\n');
}
buf.append("Population:\n");
- for (Individuum ind : pop) {
+ for(Individuum ind : pop) {
buf.append(ind.toString()).append('\n');
}
LOG.debugFinest(buf.toString());
}
iterations++;
- if (iterations > MAX_ITERATIONS) {
+ if(iterations > MAX_ITERATIONS) {
LOG.warning("Maximum iterations reached.");
break;
}
@@ -268,18 +268,18 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
* @return Convergence
*/
private boolean checkConvergence(Collection<Individuum> pop) {
- if (pop.size() == 0) {
+ if(pop.size() == 0) {
return true;
}
// Gene occurrence counter
int[][] occur = new int[dim][phi + 1];
// Count gene occurrences
- for (Individuum ind : pop) {
+ for(Individuum ind : pop) {
int[] gene = ind.getGene();
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
int val = gene[d] + DONT_CARE;
- if (val < 0 || val >= phi + 1) {
+ if(val < 0 || val >= phi + 1) {
LOG.warning("Invalid gene value encountered: " + val + " in " + ind.toString());
continue;
}
@@ -288,20 +288,20 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
}
int conv = (int) (pop.size() * 0.95);
- if (LOG.isDebuggingFine()) {
+ if(LOG.isDebuggingFine()) {
LOG.debugFine("Convergence at " + conv + " of " + pop.size() + " individuums.");
}
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
boolean converged = false;
- for (int val = 0; val < phi + 1; val++) {
- if (occur[d][val] >= conv) {
+ for(int val = 0; val < phi + 1; val++) {
+ if(occur[d][val] >= conv) {
converged = true;
break;
}
}
// A single failure to converge is sufficient to continue.
- if (!converged) {
+ if(!converged) {
return false;
}
}
@@ -318,19 +318,19 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
// Initial Population
ArrayList<Individuum> population = new ArrayList<>(popsize);
// fill population
- for (int i = 0; i < popsize; i++) {
+ for(int i = 0; i < popsize; i++) {
// Random Individual
int[] gene = new int[dim];
// fill don't care ( any dimension == don't care)
- for (int j = 0; j < dim; j++) {
+ for(int j = 0; j < dim; j++) {
gene[j] = DONT_CARE;
}
// count of don't care positions
int countDim = k;
// fill non don't care positions of the Individual
- while (countDim > 0) {
+ while(countDim > 0) {
int z = random.nextInt(dim);
- if (gene[z] == DONT_CARE) {
+ if(gene[z] == DONT_CARE) {
gene[z] = random.nextInt(phi) + 1;
countDim--;
}
@@ -361,20 +361,21 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
ArrayList<Individuum> survivors = new ArrayList<>(popsize);
// position of selection
- for (int i = 0; i < popsize; i++) {
+ for(int i = 0; i < popsize; i++) {
int z = random.nextInt(totalweight);
- for (int j = 0; j < popsize; j++) {
- if (z < popsize - j) {
+ for(int j = 0; j < popsize; j++) {
+ if(z < popsize - j) {
// TODO: need clone?
survivors.add(population.get(j));
break;
- } else {
+ }
+ else {
// decrement
z -= (popsize - j);
}
}
}
- if (survivors.size() != popsize) {
+ if(survivors.size() != popsize) {
throw new AbortException("Selection step failed - implementation error?");
}
// Don't sort, to avoid biasing the crossover!
@@ -394,23 +395,24 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
TreeSet<Integer> R = new TreeSet<>();
// for each individuum
- for (int j = 0; j < population.size(); j++) {
+ for(int j = 0; j < population.size(); j++) {
// clear the Sets
Q.clear();
R.clear();
// Fill the Sets with the Positions
- for (int i = 0; i < dim; i++) {
- if (population.get(j).getGene()[i] == DONT_CARE) {
+ for(int i = 0; i < dim; i++) {
+ if(population.get(j).getGene()[i] == DONT_CARE) {
Q.add(i);
- } else {
+ }
+ else {
R.add(i);
}
}
//
double r1 = random.nextDouble();
- if (Q.size() != 0) {
+ if(Q.size() != 0) {
// Mutation Variant 1
- if (r1 <= perc1) {
+ if(r1 <= perc1) {
// calc Mutation Spot
Integer[] pos = new Integer[Q.size()];
pos = Q.toArray(pos);
@@ -435,7 +437,7 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
}
r1 = random.nextDouble();
// Mutation Variant 2
- if (r1 <= perc2) {
+ if(r1 <= perc2) {
// calc Mutation Spot
Integer[] pos = new Integer[R.size()];
pos = R.toArray(pos);
@@ -471,14 +473,14 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
// Crossover Set of population Set
ArrayList<Individuum> crossover = new ArrayList<>();
- for (int i = 0; i < population.size() - 1; i += 2) {
+ for(int i = 0; i < population.size() - 1; i += 2) {
Pair<Individuum, Individuum> recombine = recombineOptimized(population.get(i), population.get(i + 1));
// add the Solutions to the new Set
crossover.add(recombine.getFirst());
crossover.add(recombine.getSecond());
}
// if the set contains an odd number of Subspaces, retain the last one
- if (population.size() % 2 == 1) {
+ if(population.size() % 2 == 1) {
crossover.add(population.get(population.size() - 1));
}
// Collections.sort(crossover);
@@ -499,14 +501,14 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
// Set of Positions in which neither s1 or s2 is don't care
ArrayList<Integer> R = new ArrayList<>(dim);
- for (int i = 0; i < dim; i++) {
- if ((parent1.getGene()[i] == DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
+ for(int i = 0; i < dim; i++) {
+ if((parent1.getGene()[i] == DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
Q.add(i);
}
- if ((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] == DONT_CARE)) {
+ if((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] == DONT_CARE)) {
Q.add(i);
}
- if ((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
+ if((parent1.getGene()[i] != DONT_CARE) && (parent2.getGene()[i] != DONT_CARE)) {
R.add(i);
}
}
@@ -518,11 +520,11 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
int count = k - R.size();
Iterator<Integer> q = Q.iterator();
- while (count > 0) {
+ while(count > 0) {
int[] l1 = b.clone();
int[] l2 = b.clone();
- while (q.hasNext()) {
+ while(q.hasNext()) {
int next = q.next();
// pos = next;
@@ -536,14 +538,15 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
final double sparsityL1 = sparsity(computeSubspaceForGene(l1, ranges).size(), dbsize, k, phi);
final double sparsityL2 = sparsity(computeSubspaceForGene(l2, ranges).size(), dbsize, k, phi);
- if (sparsityL1 <= sparsityL2) {
+ if(sparsityL1 <= sparsityL2) {
b = l1.clone();
- if (s1Null) {
+ if(s1Null) {
count--;
}
- } else {
+ }
+ else {
b = l2.clone();
- if (s2Null) {
+ if(s2Null) {
count--;
}
}
@@ -555,10 +558,11 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
// create the complementary String
int[] comp = new int[dim];
- for (int i = 0; i < dim; i++) {
- if (b[i] == parent1.getGene()[i]) {
+ for(int i = 0; i < dim; i++) {
+ if(b[i] == parent1.getGene()[i]) {
comp[i] = parent2.getGene()[i];
- } else {
+ }
+ else {
          comp[i] = parent1.getGene()[i];
}
}
@@ -581,7 +585,7 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
* @return best gene combination
*/
private Individuum combineRecursive(ArrayList<Integer> r, int i, int[] current, Individuum parent1, Individuum parent2) {
- if (i == r.size()) {
+ if(i == r.size()) {
return makeIndividuum(current);
}
// Position to modify
@@ -594,9 +598,10 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
Individuum i1 = combineRecursive(r, i + 1, gene1, parent1, parent2);
Individuum i2 = combineRecursive(r, i + 1, gene2, parent1, parent2);
// Return the better result.
- if (i1.getFitness() < i2.getFitness()) {
+ if(i1.getFitness() < i2.getFitness()) {
return i1;
- } else {
+ }
+ else {
return i2;
}
}
@@ -657,15 +662,15 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
@Override
public boolean equals(Object obj) {
- if (!(obj instanceof Individuum)) {
+ if(!(obj instanceof Individuum)) {
return false;
}
Individuum other = (Individuum) obj;
- if (other.second.length != this.second.length) {
+ if(other.second.length != this.second.length) {
return false;
}
- for (int i = 0; i < this.second.length; i++) {
- if (other.second[i] != this.second[i]) {
+ for(int i = 0; i < this.second.length; i++) {
+ if(other.second[i] != this.second[i]) {
return false;
}
}
@@ -703,12 +708,12 @@ public class AggarwalYuEvolutionary<V extends NumberVector<?>> extends AbstractA
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter mP = new IntParameter(M_ID);
- mP.addConstraint(new GreaterEqualConstraint(2));
- if (config.grab(mP)) {
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(mP)) {
m = mP.getValue();
}
final RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java
index 06168c5a..190d14fe 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/COP.java
@@ -62,11 +62,12 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -194,6 +195,11 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
DistanceDist dist = DistanceDist.CHISQUARED;
/**
+ * Include models in output.
+ */
+ boolean models;
+
+ /**
* Constructor.
*
* @param distanceFunction distance function
@@ -201,13 +207,15 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* @param pca PCA computation method
* @param expect Expected fraction of outliers (for score normalization)
* @param dist Distance distribution model (ChiSquared, Gamma)
+ * @param models Report models
*/
- public COP(DistanceFunction<? super V, D> distanceFunction, int k, PCARunner<V> pca, double expect, DistanceDist dist) {
+ public COP(DistanceFunction<? super V, D> distanceFunction, int k, PCARunner<V> pca, double expect, DistanceDist dist, boolean models) {
super(distanceFunction);
this.k = k;
this.pca = pca;
this.expect = expect;
this.dist = dist;
+ this.models = models;
}
/**
@@ -221,22 +229,26 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
KNNQuery<V, D> knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k + 1);
final int dim = RelationUtil.dimensionality(relation);
- if (k <= dim + 1) {
+ if(k <= dim + 1) {
      LOG.warning("PCA is underspecified with too low a k! k should be much larger than " + dim);
}
WritableDoubleDataStore cop_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
- WritableDataStore<Vector> cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Vector.class);
- WritableIntegerDataStore cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1);
+ WritableDataStore<Vector> cop_err_v = null;
+ WritableIntegerDataStore cop_dim = null;
+ if(models) {
+ cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Vector.class);
+ cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1);
+ }
// compute neighbors of each db object
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Correlation Outlier Probabilities", relation.size(), LOG) : null;
- for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
+ for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
KNNList<D> neighbors = knnQuery.getKNNForDBID(id, k + 1);
ModifiableDBIDs nids = DBIDUtil.newHashSet(neighbors);
nids.remove(id); // Do not use query object
- Vector centroid = Centroid.make(relation, nids).toVector(relation).getColumnVector();
+ Vector centroid = Centroid.make(relation, nids);
Vector relative = relation.get(id).getColumnVector().minusEquals(centroid);
PCAResult pcares = pca.processIds(nids, relation);
@@ -246,17 +258,17 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
double min = Double.POSITIVE_INFINITY;
int vdim = dim;
- switch(dist) {
+ switch(dist){
case CHISQUARED: {
double sqdevs = 0;
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
// Scale with Stddev
double dev = projected.get(d);
// Accumulate
sqdevs += dev * dev / evs[d];
// Evaluate
double score = 1 - ChiSquaredDistribution.cdf(sqdevs, d + 1);
- if (score < min) {
+ if(score < min) {
min = score;
vdim = d + 1;
}
@@ -267,21 +279,21 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
double[][] dists = new double[dim][nids.size()];
int j = 0;
Vector srel = new Vector(dim);
- for (DBIDIter s = nids.iter(); s.valid() && j < nids.size(); s.advance()) {
+ for(DBIDIter s = nids.iter(); s.valid() && j < nids.size(); s.advance()) {
V vec = relation.get(s);
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
srel.set(d, vec.doubleValue(d) - centroid.get(d));
}
Vector serr = evecs.transposeTimes(srel);
double sqdist = 0.0;
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
sqdist += serr.get(d) * serr.get(d) / evs[d];
dists[d][j] = sqdist;
}
j++;
}
double sqdevs = 0;
- for (int d = 0; d < dim; d++) {
+ for(int d = 0; d < dim; d++) {
// Scale with Stddev
final double dev = projected.get(d);
// Accumulate
@@ -290,7 +302,7 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
Arrays.sort(dists[d]);
// Evaluate
double score = 1 - GammaChoiWetteEstimator.STATIC.estimate(dists[d], SHORTENED_ARRAY).cdf(sqdevs);
- if (score < min) {
+ if(score < min) {
min = score;
vdim = d + 1;
}
@@ -301,20 +313,22 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
// Normalize the value
final double prob = expect * (1 - min) / (expect + min);
// Construct the error vector:
- for (int d = vdim; d < dim; d++) {
+ for(int d = vdim; d < dim; d++) {
projected.set(d, 0.0);
}
Vector ev = evecs.times(projected).timesEquals(-1 * prob);
cop_score.putDouble(id, prob);
- cop_err_v.put(id, ev);
- cop_dim.putInt(id, dim + 1 - vdim);
+ if(models) {
+ cop_err_v.put(id, ev);
+ cop_dim.putInt(id, dim + 1 - vdim);
+ }
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
@@ -322,8 +336,10 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
Relation<Double> scoreResult = new MaterializedRelation<>("Correlation Outlier Probabilities", COP_SCORES, TypeUtil.DOUBLE, cop_score, ids);
OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore();
OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
- result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP_DIM, TypeUtil.INTEGER, cop_dim, ids));
- result.addChildResult(new MaterializedRelation<>("Error vectors", COP_ERRORVEC, TypeUtil.VECTOR, cop_err_v, ids));
+ if(models) {
+ result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP_DIM, TypeUtil.INTEGER, cop_dim, ids));
+ result.addChildResult(new MaterializedRelation<>("Error vectors", COP_ERRORVEC, TypeUtil.VECTOR, cop_err_v, ids));
+ }
return result;
}
@@ -382,6 +398,16 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
public static final OptionID EXPECT_ID = new OptionID("cop.expect", "Expected share of outliers. Only affects score normalization.");
/**
+ * Include COP error vectors in output.
+ * <p>
+ * Key: {@code -cop.models}
+ *
+ * Default: off
+ * </p>
+ */
+ public static final OptionID MODELS_ID = new OptionID("cop.models", "Include COP models (error vectors) in output. This needs more memory.");
+
+ /**
* Number of neighbors to be considered.
*/
int k;
@@ -401,33 +427,42 @@ public class COP<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
*/
double expect;
+ /**
+ * Include COP models
+ */
+ boolean models = false;
+
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID);
kP.addConstraint(new GreaterConstraint(5));
- if (config.grab(kP)) {
+ if(config.grab(kP)) {
k = kP.intValue();
}
EnumParameter<DistanceDist> distP = new EnumParameter<>(DIST_ID, DistanceDist.class, DistanceDist.GAMMA);
- if (config.grab(distP)) {
+ if(config.grab(distP)) {
dist = distP.getValue();
}
DoubleParameter expectP = new DoubleParameter(EXPECT_ID, 0.001);
- expectP.addConstraint(new GreaterConstraint(0));
- expectP.addConstraint(new LessConstraint(1.0));
- if (config.grab(expectP)) {
+ expectP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ expectP.addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
+ if(config.grab(expectP)) {
expect = expectP.doubleValue();
}
ObjectParameter<PCARunner<V>> pcaP = new ObjectParameter<>(PCARUNNER_ID, PCARunner.class, PCARunner.class);
- if (config.grab(pcaP)) {
+ if(config.grab(pcaP)) {
pca = pcaP.instantiateClass(config);
}
+ Flag modelsF = new Flag(MODELS_ID);
+ if(config.grab(modelsF)) {
+ models = modelsF.isTrue();
+ }
}
@Override
protected COP<V, D> makeInstance() {
- return new COP<>(distanceFunction, k, pca, expect, dist);
+ return new COP<>(distanceFunction, k, pca, expect, dist, models);
}
}
}
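With this change, the error-vector and local-dimensionality relations are attached as child results only when -cop.models is set. A minimal consumer sketch, assuming the result hierarchy is traversed as in the EMOutlier change below; database, distanceFunction, k and pca are placeholders assumed to be set up elsewhere:

    // Hedged sketch, not authoritative ELKI usage: enable models and look up
    // the optional "Error vectors" child relation by its long name.
    COP<DoubleVector, DoubleDistance> cop = new COP<>(distanceFunction, k, pca, 0.001, COP.DistanceDist.GAMMA, true);
    OutlierResult result = cop.run(database);
    for(Hierarchy.Iter<Result> it = result.getHierarchy().iterChildren(result); it.valid(); it.advance()) {
      if(it.get() instanceof Relation && "Error vectors".equals(((Relation<?>) it.get()).getLongName())) {
        Relation<?> errvec = (Relation<?>) it.get(); // one error Vector per object
      }
    }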
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java
new file mode 100644
index 00000000..ef782390
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/DWOF.java
@@ -0,0 +1,407 @@
+package de.lmu.ifi.dbs.elki.algorithm.outlier;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
+import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter;
+import de.lmu.ifi.dbs.elki.database.ids.distance.KNNList;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * <p>
+ * Algorithm to compute dynamic-window outlier factors in a database based on a
+ * specified parameter {@link Parameterizer#K_ID} ({@code -dwof.k}).
+ * </p>
+ *
+ * <p>
+ * The parameter {@link Parameterizer#K_ID} specifies the number of the
+ * neighbors to be considered during the calculation of the DWOF score.
+ * </p>
+ *
+ * <p>
+ * All the distance queries (kNN and range) are determined using the
+ * parameter {@link AbstractDistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}.
+ * </p>
+ *
+ * <p>
+ * Reference: <br>
+ * Rana Momtaz, Nesma Mohssen and Mohammad A. Gowayyed: DWOF: A Robust
+ * Density-Based OutlierDetection Approach. <br>
+ * In: Pattern Recognition and Image Analysis , Proc. 6th Iberian Conference,
+ * IbPRIA 2013, Funchal, Madeira, Portugal, June 5-7, 2013.
+ * </p>
+ *
+ * @author Omar Yousry
+ *
+ * @param <O> the type of DatabaseObjects handled by this Algorithm
+ * @param <D> Distance type
+ */
+@Title("DWOF: Dynamic Window Outlier Factor")
+@Description("Algorithm to compute dynamic-window outlier factors in a database based on the neighborhood size parameter 'k'")
+@Reference(authors = "R. Momtaz, N. Mohssen, M. A. Gowayyed", title = "DWOF: A Robust Density-Based Outlier Detection Approach", booktitle = "Pattern Recognition and Image Analysis, Proc. 6th Iberian Conference, IbPRIA 2013, Funchal, Madeira, Portugal, 2013.", url = "http://dx.doi.org/10.1007%2F978-3-642-38628-2_61")
+public class DWOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(DWOF.class);
+
+ /**
+ * Holds the value of {@link Parameterizer#K_ID}, i.e. the number of
+ * neighbors to consider during the calculation of DWOF scores.
+ */
+ protected int k;
+
+ /**
+ * The radii changing ratio
+ */
+ private double delta = 1.1;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction Distance function to use in queries
+ * @param k the value of k
+ * @param delta Radius increase factor
+ */
+ public DWOF(DistanceFunction<? super O, D> distanceFunction, int k, double delta) {
+ super(distanceFunction);
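+ // Use k+1 internally: the kNN query result contains the query point itself.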
+ this.k = k + 1;
+ this.delta = delta;
+ }
+
+ /**
+ * Performs the Generalized DWOF_SCORE algorithm on the given database by
+ * calling all the other methods in the proper order.
+ *
+ * @param database Database to query
+ * @param relation Data to process
+ * @return new OutlierResult instance
+ */
+ public OutlierResult run(Database database, Relation<O> relation) {
+ final DBIDs ids = relation.getDBIDs();
+ DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
+ // Get k nearest neighbor and range query on the relation.
+ KNNQuery<O, D> knnq = database.getKNNQuery(distFunc, k, DatabaseQuery.HINT_HEAVY_USE);
+ RangeQuery<O, D> rnnQuery = database.getRangeQuery(distFunc, DatabaseQuery.HINT_HEAVY_USE);
+
+ StepProgress stepProg = LOG.isVerbose() ? new StepProgress("DWOF", 2) : null;
+ // DWOF output score storage.
+ WritableDoubleDataStore dwofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB | DataStoreFactory.HINT_HOT, 0.);
+ if(stepProg != null) {
+ stepProg.beginStep(1, "Initializing objects' Radii", LOG);
+ }
+ WritableDoubleDataStore radii = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
+ // Find an initial radius for each object:
+ initializeRadii(ids, knnq, distFunc, radii);
+ WritableIntegerDataStore oldSizes = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT, 1);
+ WritableIntegerDataStore newSizes = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT, 1);
+ int countUnmerged = relation.size();
+ if(stepProg != null) {
+ stepProg.beginStep(2, "Clustering/evaluation cycles", LOG);
+ }
+ IndefiniteProgress clusEvalProgress = LOG.isVerbose() ? new IndefiniteProgress("Evaluating DWOFs", LOG) : null;
+ while(countUnmerged > 0) {
+ if(clusEvalProgress != null) {
+ clusEvalProgress.incrementProcessed(LOG);
+ }
+ // Increase radii
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ radii.putDouble(iter, radii.doubleValue(iter) * delta);
+ }
+ // stores the clustering label for each object
+ WritableDataStore<ModifiableDBIDs> labels = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_TEMP, ModifiableDBIDs.class);
+ // Cluster objects based on the current radius
+ clusterData(ids, rnnQuery, radii, labels);
+ // simple reference swap
+ WritableIntegerDataStore temp = newSizes;
+ newSizes = oldSizes;
+ oldSizes = temp;
+
+ // Update the cluster size count for each object.
+ countUnmerged = updateSizes(ids, labels, newSizes);
+ labels.destroy();
+ // Update DWOF scores.
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ double newScore = (newSizes.intValue(iter) > 0) ? ((double) (oldSizes.intValue(iter) - 1) / (double) newSizes.intValue(iter)) : 0.0;
+ dwofs.putDouble(iter, dwofs.doubleValue(iter) + newScore);
+ }
+ }
+ if(clusEvalProgress != null) {
+ clusEvalProgress.setCompleted(LOG);
+ }
+ if(stepProg != null) {
+ stepProg.setCompleted(LOG);
+ }
+ // Build result representation.
+ DoubleMinMax minmax = new DoubleMinMax();
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ minmax.put(dwofs.doubleValue(iter));
+ }
+ OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY);
+ Relation<Double> rel = new MaterializedRelation<>("Dynamic-Window Outlier Factors", "dwof-outlier", TypeUtil.DOUBLE, dwofs, ids);
+ return new OutlierResult(meta, rel);
+ }
+
+ /**
+ * This method prepares a container for the radii of the objects and
+ * initializes radii according to the equation:
+ *
+ * initialRadii of a certain object = (absoluteMinDist of all objects) *
+ * (avgDist of the object) / (minAvgDist of all objects)
+ *
+ * @param ids Database IDs to process
+ * @param distFunc Distance function
+ * @param knnq kNN search function
+ * @param radii WritableDoubleDataStore to store radii
+ */
+ private void initializeRadii(DBIDs ids, KNNQuery<O, D> knnq, DistanceQuery<O, D> distFunc, WritableDoubleDataStore radii) {
+ FiniteProgress avgDistProgress = LOG.isVerbose() ? new FiniteProgress("Calculating average kNN distances", ids.size(), LOG) : null;
+ double absoluteMinDist = Double.POSITIVE_INFINITY;
+ double minAvgDist = Double.POSITIVE_INFINITY;
+ // to get the mean for each object
+ Mean mean = new Mean();
+ // Iterate over all objects
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ KNNList<D> iterNeighbors = knnq.getKNNForDBID(iter, k);
+ // skip the point itself
+ mean.reset();
+ for(DBIDIter neighbor1 = iterNeighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
+ if(DBIDUtil.equal(neighbor1, iter)) {
+ continue;
+ }
+ for(DBIDIter neighbor2 = iterNeighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
+ if(DBIDUtil.equal(neighbor1, neighbor2) || DBIDUtil.equal(neighbor2, iter)) {
+ continue;
+ }
+ double distance = distFunc.distance(neighbor1, neighbor2).doubleValue();
+ mean.put(distance);
+ if(distance > 0. && distance < absoluteMinDist) {
+ absoluteMinDist = distance;
+ }
+ }
+ }
+ double currentMean = mean.getMean();
+ radii.putDouble(iter, currentMean);
+ if(currentMean < minAvgDist) {
+ minAvgDist = currentMean;
+ }
+ if(avgDistProgress != null) {
+ avgDistProgress.incrementProcessed(LOG);
+ }
+ }
+ if(avgDistProgress != null) {
+ avgDistProgress.ensureCompleted(LOG);
+ }
+
+ // Initializing the radii of all objects.
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ radii.putDouble(iter, (minAvgDist > 0) ? (absoluteMinDist * radii.doubleValue(iter) / minAvgDist) : Double.POSITIVE_INFINITY);
+ }
+ }
+
+ /**
+ * This method applies a density based clustering algorithm.
+ *
+ * It looks for an unclustered object and builds a new cluster for it, then
+ * adds all the points within its radius to that cluster.
+ *
+ * nChain is the working list of points whose radius-neighbors still have
+ * to be added to the cluster.
+ *
+ * @param ids Database IDs to process
+ * @param rnnQuery Data to process
+ * @param radii Radii to cluster accordingly
+ * @param labels Label storage.
+ */
+ private void clusterData(DBIDs ids, RangeQuery<O, D> rnnQuery, WritableDoubleDataStore radii, WritableDataStore<ModifiableDBIDs> labels) {
+ FiniteProgress clustProg = LOG.isVerbose() ? new FiniteProgress("Density-Based Clustering", ids.size(), LOG) : null;
+ // Iterate over all objects
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ if(labels.get(iter) != null) {
+ continue;
+ }
+ ModifiableDBIDs newCluster = DBIDUtil.newArray();
+ newCluster.add(iter);
+ labels.put(iter, newCluster);
+ if(clustProg != null) {
+ clustProg.incrementProcessed(LOG);
+ }
+ // container of the points to be added and their radii neighbors to the
+ // cluster
+ ModifiableDBIDs nChain = DBIDUtil.newArray();
+ nChain.add(iter);
+ // iterate over nChain
+ for(DBIDIter toGetNeighbors = nChain.iter(); toGetNeighbors.valid(); toGetNeighbors.advance()) {
+ D range = rnnQuery.getDistanceFactory().fromDouble(radii.doubleValue(toGetNeighbors));
+ DistanceDBIDList<D> nNeighbors = rnnQuery.getRangeForDBID(toGetNeighbors, range);
+ for(DistanceDBIDListIter<D> iter2 = nNeighbors.iter(); iter2.valid(); iter2.advance()) {
+ if(DBIDUtil.equal(toGetNeighbors, iter2)) {
+ continue;
+ }
+ if(labels.get(iter2) == null) {
+ newCluster.add(iter2);
+ labels.put(iter2, newCluster);
+ nChain.add(iter2);
+ if(clustProg != null) {
+ clustProg.incrementProcessed(LOG);
+ }
+ }
+ else if(labels.get(iter2) != newCluster) {
+ ModifiableDBIDs toBeDeleted = labels.get(iter2);
+ newCluster.addDBIDs(toBeDeleted);
+ for(DBIDIter iter3 = toBeDeleted.iter(); iter3.valid(); iter3.advance()) {
+ labels.put(iter3, newCluster);
+ }
+ toBeDeleted.clear();
+ }
+ }
+ }
+ }
+ if(clustProg != null) {
+ clustProg.ensureCompleted(LOG);
+ }
+ }
+
+ /**
+ * This method updates each object's cluster size after the clustering step.
+ *
+ * @param ids Object IDs to process
+ * @param labels references for each object's cluster
+ * @param newSizes the sizes container to be updated
+ * @return the number of unclustered objects
+ */
+ private int updateSizes(DBIDs ids, WritableDataStore<ModifiableDBIDs> labels, WritableIntegerDataStore newSizes) {
+ // to count the unclustered all over
+ int countUnmerged = 0;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ // checking the point's new cluster size after the clustering step
+ int newClusterSize = labels.get(iter).size();
+ newSizes.putInt(iter, newClusterSize);
+ // the point is alone in the cluster --> not merged with other points
+ if(newClusterSize == 1) {
+ countUnmerged++;
+ }
+ }
+ return countUnmerged;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Omar Yousry
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
+ /**
+ * Option ID for the number of neighbors.
+ */
+ public static final OptionID K_ID = OptionID.getOrCreateOptionID("dwof.k", "Number of neighbors to get for DWOF score outlier detection.");
+
+ /**
+ * Option ID for radius increases
+ */
+ public static final OptionID DELTA_ID = OptionID.getOrCreateOptionID("dwof.delta", "Radius increase factor.");
+
+ /**
+ * Number of neighbors to get
+ */
+ protected int k = 2;
+
+ /**
+ * Radius increase factor.
+ */
+ protected double delta = 1.1;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ // The super class has the distance function parameter!
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(K_ID);
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.getValue();
+ }
+ DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
+ deltaP.setDefaultValue(1.1);
+ deltaP.addConstraint(CommonConstraints.GREATER_THAN_ONE_DOUBLE);
+ if(config.grab(deltaP)) {
+ delta = deltaP.getValue();
+ }
+ }
+
+ @Override
+ protected DWOF<O, D> makeInstance() {
+ return new DWOF<>(distanceFunction, k, delta);
+ }
+ }
+}
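The two formulas that drive DWOF are the radius initialization in initializeRadii() (initialRadius = absoluteMinDist * avgDist / minAvgDist) and the per-round score increment in run() ((oldSize - 1) / newSize). A standalone sketch of both on plain arrays, not the ELKI data-store code path; names are illustrative:

    // Radius initialization: scale each object's average kNN distance so that
    // the smallest average maps to the globally smallest positive distance.
    static double[] initialRadii(double[] avgDist, double absoluteMinDist) {
      double minAvgDist = Double.POSITIVE_INFINITY;
      for(double d : avgDist) {
        minAvgDist = Math.min(minAvgDist, d);
      }
      double[] radii = new double[avgDist.length];
      for(int i = 0; i < avgDist.length; i++) {
        radii[i] = (minAvgDist > 0) ? absoluteMinDist * avgDist[i] / minAvgDist : Double.POSITIVE_INFINITY;
      }
      return radii;
    }

    // Per-round score increment as accumulated in run(); note the algorithm
    // reports scores with an inverted meta, i.e. smaller totals are more outlying.
    static double scoreIncrement(int oldSize, int newSize) {
      return (newSize > 0) ? (oldSize - 1) / (double) newSize : 0.;
    }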
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java
index f8fd686f..76191cf2 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/EMOutlier.java
@@ -38,10 +38,12 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.Hierarchy.Iter;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
@@ -91,15 +93,27 @@ public class EMOutlier<V extends NumberVector<?>> extends AbstractAlgorithm<Outl
* @return Outlier result
*/
public OutlierResult run(Database database, Relation<V> relation) {
+ emClustering.setSoft(true);
Clustering<EMModel<V>> emresult = emClustering.run(database, relation);
+ Relation<double[]> soft = null;
+ for (Iter<Result> iter = emresult.getHierarchy().iterChildren(emresult); iter.valid(); iter.advance()) {
+ if (!(iter.get() instanceof Relation)) {
+ continue;
+ }
+ if (((Relation<?>) iter.get()).getDataTypeInformation() == EM.SOFT_TYPE) {
+ @SuppressWarnings("unchecked")
+ Relation<double[]> rel = (Relation<double[]>) iter.get();
+ soft = rel;
+ }
+ }
double globmax = 0.0;
WritableDoubleDataStore emo_score = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double maxProb = Double.POSITIVE_INFINITY;
- double[] probs = emClustering.getProbClusterIGivenX(iditer);
- for(double prob : probs) {
- maxProb = Math.min(1 - prob, maxProb);
+ double[] probs = soft.get(iditer);
+ for (double prob : probs) {
+ maxProb = Math.min(1. - prob, maxProb);
}
emo_score.putDouble(iditer, maxProb);
globmax = Math.max(maxProb, globmax);
@@ -145,4 +159,4 @@ public class EMOutlier<V extends NumberVector<?>> extends AbstractAlgorithm<Outl
return new EMOutlier<>(em);
}
}
-} \ No newline at end of file
+}
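The rewritten loop takes each object's probability vector from the soft-assignment relation and keeps the minimum over clusters of (1 - p), i.e. one minus the strongest cluster membership. A standalone sketch of that scoring rule on a plain array (illustrative name, not the ELKI code path):

    // Sketch of the EMOutlier score: 1 minus the best soft-assignment
    // probability; objects that no cluster claims score close to 1.
    static double emOutlierScore(double[] probs) {
      double best = 0.;
      for(double p : probs) {
        best = Math.max(best, p);
      }
      return 1. - best; // equals min over clusters of (1 - p)
    }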
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java
new file mode 100644
index 00000000..ee6bd434
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/FastABOD.java
@@ -0,0 +1,219 @@
+package de.lmu.ifi.dbs.elki.algorithm.outlier;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
+import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * Angle-Based Outlier Detection / Angle-Based Outlier Factor.
+ *
+ * Fast-ABOD (approximate ABOF) version.
+ *
+ * H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in
+ * High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge
+ * Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008.
+ *
+ * @author Matthias Schubert (Original Code)
+ * @author Erich Schubert (ELKIfication)
+ *
+ * @param <V> Vector type
+ */
+@Title("Approximate ABOD: Angle-Based Outlier Detection")
+@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.")
+@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946")
+public class FastABOD<V extends NumberVector<?>> extends ABOD<V> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(FastABOD.class);
+
+ /**
+ * Number of nearest neighbors.
+ */
+ protected int k;
+
+ /**
+ * Constructor for Angle-Based Outlier Detection (ABOD).
+ *
+ * @param kernelFunction kernel function to use
+ * @param k Number of nearest neighbors
+ */
+ public FastABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction, int k) {
+ super(kernelFunction);
+ this.k = k;
+ }
+
+ /**
+ * Run Fast-ABOD on the data set.
+ *
+ * @param relation Relation to process
+ * @return Outlier detection result
+ */
+ @Override
+ public OutlierResult run(Database db, Relation<V> relation) {
+ DBIDs ids = relation.getDBIDs();
+ // Build a kernel matrix, to make O(n^3) slightly less bad.
+ SimilarityQuery<V, DoubleDistance> sq = db.getSimilarityQuery(relation, kernelFunction);
+ KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
+
+ WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
+ DoubleMinMax minmaxabod = new DoubleMinMax();
+
+ MeanVariance s = new MeanVariance();
+ for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) {
+ s.reset();
+ final double simAA = kernelMatrix.getSimilarity(pA, pA);
+
+ // Collect the k nearest neighbors (by squared distance).
+ ComparableMaxHeap<DoubleDBIDPair> nn = new ComparableMaxHeap<>(k);
+ for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) {
+ if (DBIDUtil.equal(nB, pA)) {
+ continue;
+ }
+ double simBB = kernelMatrix.getSimilarity(nB, nB);
+ double simAB = kernelMatrix.getSimilarity(pA, nB);
+ double sqdAB = simAA + simBB - simAB - simAB;
+ if (!(sqdAB > 0.)) {
+ continue;
+ }
+ if (nn.size() < k) {
+ nn.add(DBIDUtil.newPair(sqdAB, nB));
+ } else if (sqdAB < nn.peek().doubleValue()) {
+ nn.replaceTopElement(DBIDUtil.newPair(sqdAB, nB));
+ }
+ }
+
+ for (ObjectHeap.UnsortedIter<DoubleDBIDPair> iB = nn.unsortedIter(); iB.valid(); iB.advance()) {
+ DoubleDBIDPair nB = iB.get();
+ double sqdAB = nB.doubleValue();
+ double simAB = kernelMatrix.getSimilarity(pA, nB);
+ if (!(sqdAB > 0.)) {
+ continue;
+ }
+ for (ObjectHeap.UnsortedIter<DoubleDBIDPair> iC = nn.unsortedIter(); iC.valid(); iC.advance()) {
+ DoubleDBIDPair nC = iC.get();
+ if (DBIDUtil.compare(nC, nB) < 0) {
+ continue;
+ }
+ double sqdAC = nC.doubleValue();
+ double simAC = kernelMatrix.getSimilarity(pA, nC);
+ if (!(sqdAC > 0.)) {
+ continue;
+ }
+ // Exploit bilinearity of scalar product:
+ // <B-A, C-A> = <B, C-A> - <A,C-A>
+ // = <B,C> - <B,A> - <A,C> + <A,A>
+ // For computing variance, AA is a constant and can be ignored.
+ double simBC = kernelMatrix.getSimilarity(nB, nC);
+ double numerator = simBC - simAB - simAC; // + simAA;
+ double val = numerator / (sqdAB * sqdAC);
+ s.put(val, 1. / Math.sqrt(sqdAB * sqdAC));
+ }
+ }
+ // Sample variance probably would be correct, but the ABOD publication
+ // uses the naive variance.
+ final double abof = s.getNaiveVariance();
+ minmaxabod.put(abof);
+ abodvalues.putDouble(pA, abof);
+ }
+
+ // Build result representation.
+ Relation<Double> scoreResult = new MaterializedRelation<>("Angle-Based Outlier Degree", "abod-outlier", TypeUtil.DOUBLE, abodvalues, relation.getDBIDs());
+ OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
+ return new OutlierResult(scoreMeta, scoreResult);
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends ABOD.Parameterizer<V> {
+ /**
+ * Parameter for the nearest neighbors.
+ */
+ public static final OptionID K_ID = new OptionID("fastabod.k", "Number of nearest neighbors to use for ABOD.");
+
+ /**
+ * Number of neighbors.
+ */
+ protected int k;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final IntParameter kP = new IntParameter(K_ID);
+ if (config.grab(kP)) {
+ k = kP.intValue();
+ }
+ }
+
+ @Override
+ protected FastABOD<V> makeInstance() {
+ return new FastABOD<>(kernelFunction, k);
+ }
+ }
+}
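Both the neighbor search and the variance computation read scalar products of difference vectors straight off the kernel matrix, justified by the bilinearity identity noted in the comments: <B-A, C-A> = <B,C> - <B,A> - <A,C> + <A,A>. A self-contained numeric check of the identity on explicit vectors:

    // Sanity check of the bilinearity identity used in FastABOD.
    static double dot(double[] x, double[] y) {
      double s = 0.;
      for(int i = 0; i < x.length; i++) {
        s += x[i] * y[i];
      }
      return s;
    }

    public static void main(String[] args) {
      double[] a = { 1, 2 }, b = { 3, -1 }, c = { -2, 4 };
      double[] ba = { b[0] - a[0], b[1] - a[1] };
      double[] ca = { c[0] - a[0], c[1] - a[1] };
      double lhs = dot(ba, ca); // -12
      double rhs = dot(b, c) - dot(b, a) - dot(a, c) + dot(a, a); // -12
      System.out.println(lhs + " == " + rhs);
    }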
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java
index c9e6a634..3f8bb484 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianModel.java
@@ -112,7 +112,7 @@ public class GaussianModel<V extends NumberVector<?>> extends AbstractAlgorithm<
Matrix covarianceTransposed = covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse();
// Normalization factors for Gaussian PDF
- final double fakt = (1.0 / (Math.sqrt(Math.pow(MathUtil.TWOPI, RelationUtil.dimensionality(relation)) * covarianceMatrix.det())));
+ final double fakt = (1.0 / (Math.sqrt(MathUtil.powi(MathUtil.TWOPI, RelationUtil.dimensionality(relation)) * covarianceMatrix.det())));
// for each object compute Mahalanobis distance
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
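Math.pow is swapped for MathUtil.powi here and in GaussianUniformMixture below because the exponent is an integer dimensionality. A sketch of exponentiation by squaring, which is the presumed behaviour of powi for non-negative integer exponents (an assumption, not the actual MathUtil source):

    // O(log n) exponentiation by squaring for integer exponents n >= 0.
    static double powi(double x, int n) {
      double r = 1., b = x;
      for(int e = n; e > 0; e >>= 1) {
        if((e & 1) == 1) {
          r *= b; // multiply in the current bit's power
        }
        b *= b; // square the base for the next bit
      }
      return r;
    }
    // Usage in the normalization factor: 1 / sqrt((2*pi)^dim * det(Sigma)).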
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java
index 294592e8..e6659a8f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/GaussianUniformMixture.java
@@ -219,7 +219,7 @@ public class GaussianUniformMixture<V extends NumberVector<?>> extends AbstractA
Matrix covInv = covarianceMatrix.cheatToAvoidSingularity(SINGULARITY_CHEAT).inverse();
double covarianceDet = covarianceMatrix.det();
- double fakt = 1.0 / Math.sqrt(Math.pow(MathUtil.TWOPI, RelationUtil.dimensionality(database)) * covarianceDet);
+ double fakt = 1.0 / Math.sqrt(MathUtil.powi(MathUtil.TWOPI, RelationUtil.dimensionality(database)) * covarianceDet);
// for each object compute probability and sum
double prob = 0;
for (DBIDIter iter = objids.iter(); iter.valid(); iter.advance()) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java
new file mode 100644
index 00000000..37b4d050
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/LBABOD.java
@@ -0,0 +1,288 @@
+package de.lmu.ifi.dbs.elki.algorithm.outlier;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
+import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
+import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
+import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.MeanVariance;
+import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
+import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMaxHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * Angle-Based Outlier Detection / Angle-Based Outlier Factor.
+ *
+ * LB-ABOD (lower-bound) version. Exact on the top l outliers, approximate on
+ * the remaining.
+ *
+ * Outlier detection using variance analysis on angles, especially for high
+ * dimensional data sets.
+ *
+ * H.-P. Kriegel, M. Schubert, and A. Zimek: Angle-Based Outlier Detection in
+ * High-dimensional Data. In: Proc. 14th ACM SIGKDD Int. Conf. on Knowledge
+ * Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008.
+ *
+ * @author Matthias Schubert (Original Code)
+ * @author Erich Schubert (ELKIfication)
+ *
+ * @param <V> Vector type
+ */
+@Title("LB-ABOD: Lower Bounded Angle-Based Outlier Detection")
+@Description("Outlier detection using variance analysis on angles, especially for high dimensional data sets.")
+@Reference(authors = "H.-P. Kriegel, M. Schubert, and A. Zimek", title = "Angle-Based Outlier Detection in High-dimensional Data", booktitle = "Proc. 14th ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining (KDD '08), Las Vegas, NV, 2008", url = "http://dx.doi.org/10.1145/1401890.1401946")
+public class LBABOD<V extends NumberVector<?>> extends FastABOD<V> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(LBABOD.class);
+
+ /**
+ * Number of outliers to refine.
+ */
+ protected int l;
+
+ /**
+ * Actual constructor, with parameters. Fast mode (sampling).
+ *
+ * @param kernelFunction Kernel function to use
+ * @param k k parameter
+ * @param l Number of outliers to find exact
+ */
+ public LBABOD(SimilarityFunction<? super V, DoubleDistance> kernelFunction, int k, int l) {
+ super(kernelFunction, k);
+ this.l = l;
+ }
+
+ /**
+ * Run LB-ABOD on the data set.
+ *
+ * @param relation Relation to process
+ * @return Outlier detection result
+ */
+ @Override
+ public OutlierResult run(Database db, Relation<V> relation) {
+ DBIDs ids = relation.getDBIDs();
+ SimilarityQuery<V, DoubleDistance> sq = relation.getDatabase().getSimilarityQuery(relation, kernelFunction);
+ KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
+
+ // Output storage.
+ WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
+ DoubleMinMax minmaxabod = new DoubleMinMax();
+ double max = 0.;
+
+ // Storage for squared distances (will be reused!)
+ WritableDoubleDataStore sqDists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
+ // Nearest neighbor heap (will be reused!)
+ ComparableMaxHeap<DoubleDBIDPair> nn = new ComparableMaxHeap<>(k);
+
+ // Priority queue for candidates
+ ComparableMinHeap<DoubleDBIDPair> candidates = new ComparableMinHeap<>(relation.size());
+ // get Candidate Ranking
+ for(DBIDIter pA = relation.iterDBIDs(); pA.valid(); pA.advance()) {
+ // Compute nearest neighbors and distances.
+ nn.clear();
+ double simAA = kernelMatrix.getSimilarity(pA, pA);
+ // Sum of 1./(|AB|) and 1./(|AB|^2); for computing R2.
+ double sumid = 0., sumisqd = 0.;
+ for(DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) {
+ if(DBIDUtil.equal(nB, pA)) {
+ continue;
+ }
+ double simBB = kernelMatrix.getSimilarity(nB, nB);
+ double simAB = kernelMatrix.getSimilarity(pA, nB);
+ double sqdAB = simAA + simBB - simAB - simAB;
+ sqDists.putDouble(nB, sqdAB);
+ if(!(sqdAB > 0.)) {
+ continue;
+ }
+ sumid += 1. / Math.sqrt(sqdAB);
+ sumisqd += 1. / sqdAB;
+ // Update heap
+ if(nn.size() < k) {
+ nn.add(DBIDUtil.newPair(sqdAB, nB));
+ }
+ else if(sqdAB < nn.peek().doubleValue()) {
+ nn.replaceTopElement(DBIDUtil.newPair(sqdAB, nB));
+ }
+ }
+
+ // Compute FastABOD approximation, adjust for lower bound.
+ // LB-ABOF is defined via a numerically unstable formula.
+ // Variance as E(X^2)-E(X)^2 suffers from catastrophic cancellation!
+ // TODO: ensure numerical precision!
+ double nnsum = 0., nnsumsq = 0., nnsumisqd = 0.;
+ for(ObjectHeap.UnsortedIter<DoubleDBIDPair> iB = nn.unsortedIter(); iB.valid(); iB.advance()) {
+ DoubleDBIDPair nB = iB.get();
+ double sqdAB = nB.doubleValue();
+ double simAB = kernelMatrix.getSimilarity(pA, nB);
+ if(!(sqdAB > 0.)) {
+ continue;
+ }
+ for(ObjectHeap.UnsortedIter<DoubleDBIDPair> iC = nn.unsortedIter(); iC.valid(); iC.advance()) {
+ DoubleDBIDPair nC = iC.get();
+ if(DBIDUtil.compare(nC, nB) < 0) {
+ continue;
+ }
+ double sqdAC = nC.doubleValue();
+ double simAC = kernelMatrix.getSimilarity(pA, nC);
+ if(!(sqdAC > 0.)) {
+ continue;
+ }
+ // Exploit bilinearity of scalar product:
+ // <B-A, C-A> = <B, C-A> - <A,C-A>
+ // = <B,C> - <B,A> - <A,C> + <A,A>
+ double simBC = kernelMatrix.getSimilarity(nB, nC);
+ double numerator = simBC - simAB - simAC + simAA;
+ double sqweight = 1. / (sqdAB * sqdAC);
+ double weight = Math.sqrt(sqweight);
+ double val = numerator * sqweight;
+ nnsum += val * weight;
+ nnsumsq += val * val * weight;
+ nnsumisqd += sqweight;
+ }
+ }
+ // Remaining weight, term R2:
+ double r2 = sumisqd * sumisqd - 2. * nnsumisqd;
+ double tmp = (2. * nnsum + r2) / (sumid * sumid);
+ double lbabof = 2. * nnsumsq / (sumid * sumid) - tmp * tmp;
+
+ // Track maximum?
+ if(lbabof > max) {
+ max = lbabof;
+ }
+ abodvalues.putDouble(pA, lbabof);
+ candidates.add(DBIDUtil.newPair(lbabof, pA));
+ }
+ minmaxabod.put(max); // Put maximum from approximate values.
+
+ // refine Candidates
+ int refinements = 0;
+ DoubleMinHeap topscores = new DoubleMinHeap(l);
+ MeanVariance s = new MeanVariance();
+ while(!candidates.isEmpty()) {
+ // Stop refining
+ if(topscores.size() >= l && candidates.peek().doubleValue() > topscores.peek()) {
+ break;
+ }
+ DoubleDBIDPair pA = candidates.poll();
+ final double abof = computeABOF(relation, kernelMatrix, pA, s);
+ // Store refined score:
+ abodvalues.putDouble(pA, abof);
+ minmaxabod.put(abof);
+ // Update the heap tracking the top scores.
+ if(topscores.size() < l) {
+ topscores.add(abof);
+ }
+ else {
+ if(topscores.peek() > abof) {
+ topscores.replaceTopElement(abof);
+ }
+ }
+ refinements += 1;
+ }
+ if(LOG.isStatistics()) {
+ LOG.statistics(new LongStatistic("lb-abod.refinements", refinements));
+ }
+ // Build result representation.
+ Relation<Double> scoreResult = new MaterializedRelation<>("Angle-based Outlier Detection", "abod-outlier", TypeUtil.DOUBLE, abodvalues, ids);
+ OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
+ return new OutlierResult(scoreMeta, scoreResult);
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V extends NumberVector<?>> extends FastABOD.Parameterizer<V> {
+ /**
+ * Parameter to specify the number of outliers to compute exactly.
+ */
+ public static final OptionID L_ID = new OptionID("abod.l", "Number of top outliers to compute.");
+
+ /**
+ * Number of outliers to find.
+ */
+ protected int l = 0;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final IntParameter lP = new IntParameter(L_ID);
+ lP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(lP)) {
+ l = lP.getValue();
+ }
+ }
+
+ @Override
+ protected LBABOD<V> makeInstance() {
+ return new LBABOD<>(kernelFunction, k, l);
+ }
+ }
+}
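The TODO about numerical precision is warranted: evaluating a variance as E(X^2) - E(X)^2 cancels catastrophically once the mean dominates the spread. A small standalone demonstration with illustrative values:

    // Naive one-pass variance vs. a stable two-pass computation.
    public static void main(String[] args) {
      double[] x = { 1e8 + 4, 1e8 + 7, 1e8 + 13, 1e8 + 16 }; // true variance 22.5
      double sum = 0., sumsq = 0.;
      for(double v : x) {
        sum += v;
        sumsq += v * v;
      }
      double n = x.length, mean = sum / n;
      double naive = sumsq / n - mean * mean; // suffers cancellation near 1e16
      double stable = 0.;
      for(double v : x) {
        stable += (v - mean) * (v - mean); // two-pass, centered
      }
      System.out.println(naive + " vs " + stable / n);
    }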
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java
index f22cdeb7..a5b39146 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ODIN.java
@@ -45,7 +45,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -110,19 +110,19 @@ public class ODIN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorit
double inc = 1. / (k - 1);
double min = Double.POSITIVE_INFINITY, max = 0.0;
// Process all objects
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
// Find the nearest neighbors (using an index, if available!)
KNNList<D> neighbors = knnq.getKNNForDBID(iter, k);
// For each neighbor, except ourselves, increase the in-degree:
- for (DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) {
- if (DBIDUtil.equal(iter, nei)) {
+ for(DBIDIter nei = neighbors.iter(); nei.valid(); nei.advance()) {
+ if(DBIDUtil.equal(iter, nei)) {
continue;
}
final double value = scores.doubleValue(nei) + inc;
- if (value < min) {
+ if(value < min) {
min = value;
}
- if (value > max) {
+ if(value > max) {
max = value;
}
scores.put(nei, value);
@@ -178,8 +178,8 @@ public class ODIN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorit
// Since in a database context, the 1 nearest neighbor
// will usually be the query object itself, we require
// this value to be at least 2.
- param.addConstraint(new GreaterConstraint(1));
- if (config.grab(param)) {
+ param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(param)) {
k = param.intValue();
}
}
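The ODIN scoring pass above is in-degree counting on the kNN graph: each object donates inc = 1/(k-1) to each of its k-1 nearest neighbors, so objects that rarely occur in other neighborhoods keep low scores. A standalone sketch over a precomputed neighbor-index table (hypothetical helper, not ELKI code):

    // knn[i] lists the k-1 nearest neighbors of object i (i itself excluded).
    static double[] odinScores(int[][] knn, int k) {
      double inc = 1. / (k - 1);
      double[] scores = new double[knn.length];
      for(int[] neighbors : knn) {
        for(int j : neighbors) {
          scores[j] += inc; // in-degree contribution
        }
      }
      return scores;
    }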
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java
index f6d46f57..b1ffae63 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/OPTICSOF.java
@@ -1,26 +1,27 @@
package de.lmu.ifi.dbs.elki.algorithm.outlier;
-/*
-This file is part of ELKI:
-Environment for Developing KDD-Applications Supported by Index-Structures
-
-Copyright (C) 2013
-Ludwig-Maximilians-Universität München
-Lehr- und Forschungseinheit für Datenbanksysteme
-ELKI Development Team
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.ArrayList;
import java.util.List;
@@ -54,7 +55,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -122,7 +123,7 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc
// Pass 1
// N_minpts(id) and core-distance(id)
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNList<D> minptsNeighbours = knnQuery.getKNNForDBID(iditer, minpts);
D d = minptsNeighbours.getKNNDistance();
nMinPts.put(iditer, minptsNeighbours);
@@ -133,11 +134,11 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc
// Pass 2
WritableDataStore<List<Double>> reachDistance = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, List.class);
WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
List<Double> core = new ArrayList<>();
double lrd = 0;
// TODO: optimize for double distances
- for (DistanceDBIDListIter<D> neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
+ for(DistanceDBIDListIter<D> neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
double coreDist = coreDistance.doubleValue(neighbor);
double dist = distQuery.distance(iditer, neighbor).doubleValue();
double rd = Math.max(coreDist, dist);
@@ -152,9 +153,9 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc
// Pass 3
DoubleMinMax ofminmax = new DoubleMinMax();
WritableDoubleDataStore ofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double of = 0;
- for (DBIDIter neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
+ for(DBIDIter neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
double lrd = lrds.doubleValue(iditer);
double lrdN = lrds.doubleValue(neighbor);
of = of + lrdN / lrd;
@@ -169,7 +170,7 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc
OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(ofminmax.getMin(), ofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
return new OutlierResult(scoreMeta, scoreResult);
}
-
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
@@ -181,11 +182,11 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc
}
/**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
*/
public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
protected int minpts = 0;
@@ -194,7 +195,7 @@ public class OPTICSOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanc
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter param = new IntParameter(OPTICS.MINPTS_ID);
- param.addConstraint(new GreaterConstraint(1));
+ param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(param)) {
minpts = param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java
index 092bbc45..d254c9a1 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/ReferenceBasedOutlierDetection.java
@@ -56,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -182,14 +182,14 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?>, D extends
}
// compute maximum density
double maxDensity = 0.0;
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double dens = rbod_score.doubleValue(iditer);
if(dens > maxDensity) {
maxDensity = dens;
}
}
// compute ROS
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double score = 1 - (rbod_score.doubleValue(iditer) / maxDensity);
rbod_score.putDouble(iditer, score);
}
@@ -218,7 +218,7 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?>, D extends
protected DistanceDBIDList<D> computeDistanceVector(V refPoint, Relation<V> database, DistanceQuery<V, D> distFunc) {
// TODO: optimize for double distances?
GenericDistanceDBIDList<D> referenceDists = new GenericDistanceDBIDList<>(database.size());
- for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) {
referenceDists.add(distFunc.distance(iditer, refPoint), iditer);
}
referenceDists.sort();
@@ -319,7 +319,7 @@ public class ReferenceBasedOutlierDetection<V extends NumberVector<?>, D extends
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter pK = new IntParameter(K_ID);
- pK.addConstraint(new GreaterConstraint(1));
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(pK)) {
k = pK.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java
index 38820ab7..72a727a5 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/SimpleCOP.java
@@ -60,7 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -123,7 +123,7 @@ public class SimpleCOP<V extends NumberVector<?>, D extends NumberDistance<D, ?>
{// compute neighbors of each db object
FiniteProgress progressLocalPCA = LOG.isVerbose() ? new FiniteProgress("Correlation Outlier Probabilities", data.size(), LOG) : null;
double sqrt2 = Math.sqrt(2.0);
- for (DBIDIter id = data.iterDBIDs(); id.valid(); id.advance()) {
+ for(DBIDIter id = data.iterDBIDs(); id.valid(); id.advance()) {
KNNList<D> neighbors = knnQuery.getKNNForDBID(id, k + 1);
ModifiableDBIDs nids = DBIDUtil.newArray(neighbors);
nids.remove(id);
@@ -147,11 +147,11 @@ public class SimpleCOP<V extends NumberVector<?>, D extends NumberDistance<D, ?>
cop_sol.put(id, depsol);
- if (progressLocalPCA != null) {
+ if(progressLocalPCA != null) {
progressLocalPCA.incrementProcessed(LOG);
}
}
- if (progressLocalPCA != null) {
+ if(progressLocalPCA != null) {
progressLocalPCA.ensureCompleted(LOG);
}
}
@@ -218,12 +218,12 @@ public class SimpleCOP<V extends NumberVector<?>, D extends NumberDistance<D, ?>
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(0));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
ObjectParameter<PCAFilteredRunner<V>> pcaP = new ObjectParameter<>(PCARUNNER_ID, PCAFilteredRunner.class, PCAFilteredRunner.class);
- if (config.grab(pcaP)) {
+ if(config.grab(pcaP)) {
pca = pcaP.instantiateClass(config);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java
index d48679a9..f978365e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/ALOCI.java
@@ -141,7 +141,7 @@ public class ALOCI<O extends NumberVector<?>, D extends NumberDistance<D, ?>> ex
public OutlierResult run(Database database, Relation<O> relation) {
final int dim = RelationUtil.dimensionality(relation);
- final Random random = rnd.getRandom();
+ final Random random = rnd.getSingleThreadedRandom();
    FiniteProgress progressPreproc = LOG.isVerbose() ? new FiniteProgress("Build aLOCI quadtrees", g, LOG) : null;
    // Compute extent of dataset.
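
ALOCI here, and FeatureBagging and HiCS further down, change `rnd.getRandom()` to `rnd.getSingleThreadedRandom()` on the configured `RandomFactory`. Judging from the call sites, the single-threaded variant hands out an unsynchronized generator meant to be drawn once and then reused inside a sequential loop; a hedged sketch (the loop body and the `numTrees` bound are placeholders, not ALOCI's actual code):

    // rnd is the RandomFactory configured via a RandomParameter (seed option).
    final Random random = rnd.getSingleThreadedRandom();
    for(int i = 0; i < numTrees; i++) {
      int shift = random.nextInt(1 << 10); // hypothetical per-tree random shift
      // ... build one shifted quadtree ...
    }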
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java
index 80f60e8b..2508b6b0 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/FlexibleLOF.java
@@ -64,7 +64,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -581,14 +581,14 @@ public class FlexibleLOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgo
super.makeOptions(config);
final IntParameter pK = new IntParameter(KREF_ID);
- pK.addConstraint(new GreaterConstraint(1));
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if (config.grab(pK)) {
krefer = pK.intValue();
}
final IntParameter pK2 = new IntParameter(KREACH_ID);
pK2.setOptional(true);
- pK2.addConstraint(new GreaterConstraint(1));
+ pK2.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if (config.grab(pK2)) {
kreach = pK2.intValue();
} else {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java
index ae297a3c..28fcf01b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/INFLO.java
@@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -142,7 +142,7 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
// density
WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
// init knns and rnns
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
knns.put(iditer, DBIDUtil.newArray());
rnns.put(iditer, DBIDUtil.newArray());
}
@@ -150,10 +150,10 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
// TODO: use kNN preprocessor?
KNNQuery<O, D> knnQuery = database.getKNNQuery(distFunc, k, DatabaseQuery.HINT_HEAVY_USE);
- for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
+ for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
// if not visited count=0
int count = rnns.get(id).size();
- if (!processedIDs.contains(id)) {
+ if(!processedIDs.contains(id)) {
// TODO: use exactly k neighbors?
KNNList<D> list = knnQuery.getKNNForDBID(id, k);
knns.get(id).addDBIDs(list);
@@ -162,8 +162,8 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
}
ModifiableDBIDs s = knns.get(id);
- for (DBIDIter q = knns.get(id).iter(); q.valid(); q.advance()) {
- if (!processedIDs.contains(q)) {
+ for(DBIDIter q = knns.get(id).iter(); q.valid(); q.advance()) {
+ if(!processedIDs.contains(q)) {
// TODO: use exactly k neighbors?
KNNList<D> listQ = knnQuery.getKNNForDBID(q, k);
knns.get(q).addDBIDs(listQ);
@@ -171,13 +171,13 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
processedIDs.add(q);
}
- if (knns.get(q).contains(id)) {
+ if(knns.get(q).contains(id)) {
rnns.get(q).add(id);
rnns.get(id).add(q);
count++;
}
}
- if (count >= s.size() * m) {
+ if(count >= s.size() * m) {
pruned.add(id);
}
}
@@ -186,15 +186,15 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
// IF Object is pruned INFLO=1.0
DoubleMinMax inflominmax = new DoubleMinMax();
WritableDoubleDataStore inflos = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
- for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
- if (!pruned.contains(id)) {
+ for(DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
+ if(!pruned.contains(id)) {
ModifiableDBIDs knn = knns.get(id);
ModifiableDBIDs rnn = rnns.get(id);
double denP = density.doubleValue(id);
knn.addDBIDs(rnn);
Mean mean = new Mean();
- for (DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) {
mean.put(density.doubleValue(iter));
}
double den = mean.getMean() / denP;
@@ -203,7 +203,7 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
inflominmax.put(den);
}
- if (pruned.contains(id)) {
+ if(pruned.contains(id)) {
inflos.putDouble(id, 1.0);
inflominmax.put(1.0);
}
@@ -241,14 +241,14 @@ public class INFLO<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBa
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final DoubleParameter mP = new DoubleParameter(M_ID, 1.0);
- mP.addConstraint(new GreaterConstraint(0.0));
- if (config.grab(mP)) {
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(mP)) {
m = mP.doubleValue();
}
final IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(1));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(kP)) {
k = kP.intValue();
}
}
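
Summarizing the score assembled in the hunks above: an object p that was not pruned is scored by the mean density over its merged kNN and reverse-kNN sets, divided by its own density; pruned objects receive the neutral score 1.0. In the hunk's own identifiers, the core is:

    // knn has already been merged with the reverse neighbors: knn.addDBIDs(rnn).
    Mean mean = new Mean();
    for(DBIDIter iter = knn.iter(); iter.valid(); iter.advance()) {
      mean.put(density.doubleValue(iter));
    }
    // INFLO(p) = mean density over kNN(p) ∪ RkNN(p), divided by density(p):
    double den = mean.getMean() / denP;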
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java
index 4a86e93d..e5049877 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDF.java
@@ -55,6 +55,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.GaussianKernelDensityFunction;
import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
@@ -62,7 +63,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -149,8 +150,8 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
// "HEAVY" flag for KNN Query since it is used more than once
KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
// No optimized kNN query - use a preprocessor!
- if (!(knnq instanceof PreprocessorKNNQuery)) {
- if (stepprog != null) {
+ if(!(knnq instanceof PreprocessorKNNQuery)) {
+ if(stepprog != null) {
stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG);
}
MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k);
@@ -160,43 +161,46 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
}
// Compute LDEs
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(2, "Computing LDEs.", LOG);
}
WritableDoubleDataStore ldes = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
FiniteProgress densProgress = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null;
- for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
final KNNList<D> neighbors = knnq.getKNNForDBID(it, k);
double sum = 0.0;
int count = 0;
- if (neighbors instanceof DoubleDistanceKNNList) {
+ if(neighbors instanceof DoubleDistanceKNNList) {
// Fast version for double distances
- for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
- if (DBIDUtil.equal(neighbor, it)) {
+ for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
+ if(DBIDUtil.equal(neighbor, it)) {
continue;
}
final double nkdist = ((DoubleDistanceKNNList) knnq.getKNNForDBID(neighbor, k)).doubleKNNDistance();
- if (nkdist > 0.) {
+ if(nkdist > 0.) {
final double v = Math.max(nkdist, neighbor.doubleDistance()) / (h * nkdist);
- sum += kernel.density(v) / Math.pow(h * nkdist, dim);
+ sum += kernel.density(v) / MathUtil.powi(h * nkdist, dim);
count++;
- } else {
+ }
+ else {
sum = Double.POSITIVE_INFINITY;
count++;
break;
}
}
- } else {
- for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
- if (DBIDUtil.equal(neighbor, it)) {
+ }
+ else {
+ for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ if(DBIDUtil.equal(neighbor, it)) {
continue;
}
final double nkdist = knnq.getKNNForDBID(neighbor, k).getKNNDistance().doubleValue();
- if (nkdist > 0.) {
+ if(nkdist > 0.) {
final double v = Math.max(nkdist, neighbor.getDistance().doubleValue()) / (h * nkdist);
- sum += kernel.density(v) / Math.pow(h * nkdist, dim);
+ sum += kernel.density(v) / MathUtil.powi(h * nkdist, dim);
count++;
- } else {
+ }
+ else {
sum = Double.POSITIVE_INFINITY;
count++;
break;
@@ -204,16 +208,16 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
}
}
ldes.putDouble(it, sum / count);
- if (densProgress != null) {
+ if(densProgress != null) {
densProgress.incrementProcessed(LOG);
}
}
- if (densProgress != null) {
+ if(densProgress != null) {
densProgress.ensureCompleted(LOG);
}
// Compute local density factors.
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(3, "Computing LDFs.", LOG);
}
WritableDoubleDataStore ldfs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
@@ -221,14 +225,14 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
DoubleMinMax lofminmax = new DoubleMinMax();
FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("Local Density Factors", ids.size(), LOG) : null;
- for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
final double lrdp = ldes.doubleValue(it);
final KNNList<D> neighbors = knnq.getKNNForDBID(it, k);
double sum = 0.0;
int count = 0;
- for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
// skip the point itself
- if (DBIDUtil.equal(neighbor, it)) {
+ if(DBIDUtil.equal(neighbor, it)) {
continue;
}
sum += ldes.doubleValue(neighbor);
@@ -241,15 +245,15 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
// update minimum and maximum
lofminmax.put(ldf);
- if (progressLOFs != null) {
+ if(progressLOFs != null) {
progressLOFs.incrementProcessed(LOG);
}
}
- if (progressLOFs != null) {
+ if(progressLOFs != null) {
progressLOFs.ensureCompleted(LOG);
}
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.setCompleted(LOG);
}
@@ -327,23 +331,23 @@ public class LDF<O extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
super.makeOptions(config);
final IntParameter pK = new IntParameter(K_ID);
- pK.addConstraint(new GreaterConstraint(1));
- if (config.grab(pK)) {
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(pK)) {
k = pK.getValue();
}
ObjectParameter<KernelDensityFunction> kernelP = new ObjectParameter<>(KERNEL_ID, KernelDensityFunction.class, GaussianKernelDensityFunction.class);
- if (config.grab(kernelP)) {
+ if(config.grab(kernelP)) {
kernel = kernelP.instantiateClass(config);
}
DoubleParameter hP = new DoubleParameter(H_ID);
- if (config.grab(hP)) {
+ if(config.grab(hP)) {
h = hP.doubleValue();
}
DoubleParameter cP = new DoubleParameter(C_ID, 0.1);
- if (config.grab(cP)) {
+ if(config.grab(cP)) {
c = cP.doubleValue();
}
}
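
LDF (and SimpleKernelDensityLOF below) swap `Math.pow(base, dim)` for `MathUtil.powi(base, dim)` in the kernel normalization; the exponent is the integer data dimensionality, so an integer-power routine can avoid the general floating-point `pow` path. A sketch of the usual exponentiation-by-squaring implementation of such a helper, for illustration only (not ELKI's actual source):

    // Integer power by repeated squaring: O(log n) multiplies; assumes n >= 0,
    // which holds at call sites like powi(h * nkdist, dim).
    static double powi(double base, int n) {
      double result = 1.0;
      double b = base;
      while(n > 0) {
        if((n & 1) == 1) {
          result *= b; // fold in the set bits of the exponent
        }
        b *= b;
        n >>= 1;
      }
      return result;
    }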
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java
index 80ed3f68..36c70b48 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LDOF.java
@@ -53,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -80,7 +80,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@Title("LDOF: Local Distance-Based Outlier Factor")
@Description("Local outlier detection appraoch suitable for scattered data by averaging the kNN distance over all k nearest neighbors")
@Reference(authors = "K. Zhang, M. Hutter, H. Jin", title = "A New Local Distance-Based Outlier Detection Approach for Scattered Real-World Data", booktitle = "Proc. 13th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2009), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2_84")
-@Alias({"de.lmu.ifi.dbs.elki.algorithm.outlier.LDOF"})
+@Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.LDOF" })
public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, OutlierResult> implements OutlierAlgorithm {
/**
* The logger for this class.
@@ -138,15 +138,16 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
FiniteProgress progressLDOFs = LOG.isVerbose() ? new FiniteProgress("LDOF_SCORE for objects", relation.size(), LOG) : null;
Mean dxp = new Mean(), Dxp = new Mean();
-    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNList<D> neighbors = knnQuery.getKNNForDBID(iditer, k);
// skip the point itself
- dxp.reset(); Dxp.reset();
+ dxp.reset();
+ Dxp.reset();
// TODO: optimize for double distances
- for (DistanceDBIDListIter<D> neighbor1 = neighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
+ for(DistanceDBIDListIter<D> neighbor1 = neighbors.iter(); neighbor1.valid(); neighbor1.advance()) {
if(!DBIDUtil.equal(neighbor1, iditer)) {
dxp.put(neighbor1.getDistance().doubleValue());
- for (DistanceDBIDListIter<D> neighbor2 = neighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
+ for(DistanceDBIDListIter<D> neighbor2 = neighbors.iter(); neighbor2.valid(); neighbor2.advance()) {
if(!DBIDUtil.equal(neighbor1, neighbor2) && !DBIDUtil.equal(neighbor2, iditer)) {
Dxp.put(distFunc.distance(neighbor1, neighbor2).doubleValue());
}
@@ -199,7 +200,7 @@ public class LDOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBas
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterConstraint(1));
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if(config.grab(kP)) {
k = kP.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java
index 302dafe6..28166c75 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LOF.java
@@ -30,6 +30,7 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
@@ -59,7 +60,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -129,8 +130,8 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBase
// "HEAVY" flag for knn query since it is used more than once
KNNQuery<O, D> knnq = database.getKNNQuery(dq, k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
// No optimized kNN query - use a preprocessor!
- if (!(knnq instanceof PreprocessorKNNQuery)) {
- if (stepprog != null) {
+ if(!(knnq instanceof PreprocessorKNNQuery)) {
+ if(stepprog != null) {
stepprog.beginStep(1, "Materializing LOF neighborhoods.", LOG);
}
MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k);
@@ -139,109 +140,131 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBase
DBIDs ids = relation.getDBIDs();
// Compute LRDs
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(2, "Computing LRDs.", LOG);
}
WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
- {
- FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("LRD", ids.size(), LOG) : null;
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k);
- double sum = 0.0;
- int count = 0;
- if (neighbors instanceof DoubleDistanceKNNList) {
- // Fast version for double distances
- for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
- if (DBIDUtil.equal(neighbor, iter)) {
- continue;
- }
- KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k);
- final double nkdist;
- if (neighborsNeighbors instanceof DoubleDistanceKNNList) {
- nkdist = ((DoubleDistanceKNNList) neighborsNeighbors).doubleKNNDistance();
- } else {
- nkdist = neighborsNeighbors.getKNNDistance().doubleValue();
- }
- sum += Math.max(neighbor.doubleDistance(), nkdist);
- count++;
- }
- } else {
- for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
- if (DBIDUtil.equal(neighbor, iter)) {
- continue;
- }
- KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k);
- sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue());
- count++;
- }
- }
- // Avoid division by 0
- final double lrd = (sum > 0) ? (count / sum) : Double.POSITIVE_INFINITY;
- lrds.putDouble(iter, lrd);
- if (lrdsProgress != null) {
- lrdsProgress.incrementProcessed(LOG);
- }
- }
- if (lrdsProgress != null) {
- lrdsProgress.ensureCompleted(LOG);
- }
- }
+ computeLRDs(knnq, ids, lrds);
// compute LOF_SCORE of each db object
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(3, "Computing LOFs.", LOG);
}
- WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
+ WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_DB);
// track the maximum value for normalization.
DoubleMinMax lofminmax = new DoubleMinMax();
- {
- FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), LOG) : null;
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- final double lof;
- final double lrdp = lrds.doubleValue(iter);
- final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k);
- if (!Double.isInfinite(lrdp)) {
- double sum = 0.0;
- int count = 0;
- for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
- // skip the point itself
- if (DBIDUtil.equal(neighbor, iter)) {
- continue;
- }
- final double val = lrds.doubleValue(neighbor);
- sum += val;
- count++;
- if (Double.isInfinite(val)) {
- break;
- }
+ computeLOFScores(knnq, ids, lrds, lofs, lofminmax);
+
+ if(stepprog != null) {
+ stepprog.setCompleted(LOG);
+ }
+
+ // Build result representation.
+ Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids);
+ OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
+ return new OutlierResult(scoreMeta, scoreResult);
+ }
+
+ /**
+ * Compute local reachability distances.
+ *
+ * @param knnq KNN query
+ * @param ids IDs to process
+ * @param lrds Reachability storage
+ */
+ private void computeLRDs(KNNQuery<O, D> knnq, DBIDs ids, WritableDoubleDataStore lrds) {
+ FiniteProgress lrdsProgress = LOG.isVerbose() ? new FiniteProgress("LRD", ids.size(), LOG) : null;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k);
+ double sum = 0.0;
+ int count = 0;
+ if(neighbors instanceof DoubleDistanceKNNList) {
+ // Fast version for double distances
+ for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
+ if(DBIDUtil.equal(neighbor, iter)) {
+ continue;
+ }
+ KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k);
+ final double nkdist;
+ if(neighborsNeighbors instanceof DoubleDistanceKNNList) {
+ nkdist = ((DoubleDistanceKNNList) neighborsNeighbors).doubleKNNDistance();
+ }
+ else {
+ nkdist = neighborsNeighbors.getKNNDistance().doubleValue();
}
- lof = sum / (lrdp * count);
- } else {
- lof = 1.0;
+ sum += Math.max(neighbor.doubleDistance(), nkdist);
+ count++;
}
- lofs.putDouble(iter, lof);
- // update minimum and maximum
- lofminmax.put(lof);
-
- if (progressLOFs != null) {
- progressLOFs.incrementProcessed(LOG);
+ }
+ else {
+ for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ if(DBIDUtil.equal(neighbor, iter)) {
+ continue;
+ }
+ KNNList<D> neighborsNeighbors = knnq.getKNNForDBID(neighbor, k);
+ sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.getKNNDistance().doubleValue());
+ count++;
}
}
- if (progressLOFs != null) {
- progressLOFs.ensureCompleted(LOG);
+ // Avoid division by 0
+ final double lrd = (sum > 0) ? (count / sum) : Double.POSITIVE_INFINITY;
+ lrds.putDouble(iter, lrd);
+ if(lrdsProgress != null) {
+ lrdsProgress.incrementProcessed(LOG);
}
}
-
- if (stepprog != null) {
- stepprog.setCompleted(LOG);
+ if(lrdsProgress != null) {
+ lrdsProgress.ensureCompleted(LOG);
}
+ }
- // Build result representation.
- Relation<Double> scoreResult = new MaterializedRelation<>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, lofs, ids);
- OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(lofminmax.getMin(), lofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
- OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
+ /**
+ * Compute local outlier factors.
+ *
+ * @param knnq KNN query
+ * @param ids IDs to process
+ * @param lrds Local reachability distances
+ * @param lofs Local outlier factor storage
+ * @param lofminmax Score minimum/maximum tracker
+ */
+ private void computeLOFScores(KNNQuery<O, D> knnq, DBIDs ids, DoubleDataStore lrds, WritableDoubleDataStore lofs, DoubleMinMax lofminmax) {
+ FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), LOG) : null;
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ final double lof;
+ final double lrdp = lrds.doubleValue(iter);
+ final KNNList<D> neighbors = knnq.getKNNForDBID(iter, k);
+ if(!Double.isInfinite(lrdp)) {
+ double sum = 0.0;
+ int count = 0;
+ for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ // skip the point itself
+ if(DBIDUtil.equal(neighbor, iter)) {
+ continue;
+ }
+ final double val = lrds.doubleValue(neighbor);
+ sum += val;
+ count++;
+ if(Double.isInfinite(val)) {
+ break;
+ }
+ }
+ lof = sum / (lrdp * count);
+ }
+ else {
+ lof = 1.0;
+ }
+ lofs.putDouble(iter, lof);
+ // update minimum and maximum
+ lofminmax.put(lof);
- return result;
+ if(progressLOFs != null) {
+ progressLOFs.incrementProcessed(LOG);
+ }
+ }
+ if(progressLOFs != null) {
+ progressLOFs.ensureCompleted(LOG);
+ }
}
@Override
@@ -279,8 +302,8 @@ public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBase
super.makeOptions(config);
final IntParameter pK = new IntParameter(K_ID);
- pK.addConstraint(new GreaterConstraint(1));
- if (config.grab(pK)) {
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(pK)) {
k = pK.getValue();
}
}
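
With the two helpers extracted, LOF's `run()` reduces to a short pipeline; its skeleton, with all names and storage hints copied from the hunks above:

    // Step 2: local reachability densities, lrd(p) = count / sum of reach-dists.
    WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids,
        DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    computeLRDs(knnq, ids, lrds);
    // Step 3: LOF(p) = mean lrd of p's neighbors / lrd(p). The scores now live
    // in a HINT_HOT | HINT_DB store instead of HINT_STATIC (rationale not
    // stated in the patch).
    WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids,
        DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_DB);
    DoubleMinMax lofminmax = new DoubleMinMax();
    computeLOFScores(knnq, ids, lrds, lofs, lofminmax);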
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java
index 15ff690a..525d45f2 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/LoOP.java
@@ -64,7 +64,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -183,26 +183,28 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
protected Pair<KNNQuery<O, D>, KNNQuery<O, D>> getKNNQueries(Database database, Relation<O> relation, StepProgress stepprog) {
KNNQuery<O, D> knnComp;
KNNQuery<O, D> knnReach;
- if (comparisonDistanceFunction == reachabilityDistanceFunction || comparisonDistanceFunction.equals(reachabilityDistanceFunction)) {
+ if(comparisonDistanceFunction == reachabilityDistanceFunction || comparisonDistanceFunction.equals(reachabilityDistanceFunction)) {
// We need each neighborhood twice - use "HEAVY" flag.
knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, Math.max(kreach, kcomp), DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
// No optimized kNN query - use a preprocessor!
- if (knnComp == null) {
- if (stepprog != null) {
+ if(knnComp == null) {
+ if(stepprog != null) {
stepprog.beginStep(1, "Materializing neighborhoods with respect to reference neighborhood distance function.", LOG);
}
MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, comparisonDistanceFunction, kcomp);
database.addIndex(preproc);
DistanceQuery<O, D> cdq = database.getDistanceQuery(relation, comparisonDistanceFunction);
knnComp = preproc.getKNNQuery(cdq, kreach, DatabaseQuery.HINT_HEAVY_USE);
- } else {
- if (stepprog != null) {
+ }
+ else {
+ if(stepprog != null) {
stepprog.beginStep(1, "Optimized neighborhoods provided by database.", LOG);
}
}
knnReach = knnComp;
- } else {
- if (stepprog != null) {
+ }
+ else {
+ if(stepprog != null) {
stepprog.beginStep(1, "Not materializing distance functions, since we request each DBID once only.", LOG);
}
knnComp = QueryUtil.getKNNQuery(relation, comparisonDistanceFunction, kreach);
@@ -228,10 +230,10 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
KNNQuery<O, D> knnReach = pair.getSecond();
// Assert we got something
- if (knnComp == null) {
+ if(knnComp == null) {
throw new AbortException("No kNN queries supported by database for comparison distance function.");
}
- if (knnReach == null) {
+ if(knnReach == null) {
throw new AbortException("No kNN queries supported by database for density estimation distance function.");
}
@@ -239,34 +241,35 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
WritableDoubleDataStore pdists = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
Mean mean = new Mean();
{// computing PRDs
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(3, "Computing pdists", LOG);
}
FiniteProgress prdsProgress = LOG.isVerbose() ? new FiniteProgress("pdists", relation.size(), LOG) : null;
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
final KNNList<D> neighbors = knnReach.getKNNForDBID(iditer, kreach);
mean.reset();
// use first kref neighbors as reference set
int ks = 0;
// TODO: optimize for double distances
- if (neighbors instanceof DoubleDistanceKNNList) {
- for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
- if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) {
+ if(neighbors instanceof DoubleDistanceKNNList) {
+ for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
+ if(objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) {
final double d = neighbor.doubleDistance();
mean.put(d * d);
ks++;
- if (ks >= kreach) {
+ if(ks >= kreach) {
break;
}
}
}
- } else {
- for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
- if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) {
+ }
+ else {
+ for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ if(objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) {
double d = neighbor.getDistance().doubleValue();
mean.put(d * d);
ks++;
- if (ks >= kreach) {
+ if(ks >= kreach) {
break;
}
}
@@ -274,7 +277,7 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
}
double pdist = lambda * Math.sqrt(mean.getMean());
pdists.putDouble(iditer, pdist);
- if (prdsProgress != null) {
+ if(prdsProgress != null) {
prdsProgress.incrementProcessed(LOG);
}
}
@@ -283,62 +286,62 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
WritableDoubleDataStore plofs = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
MeanVariance mvplof = new MeanVariance();
{// compute LOOP_SCORE of each db object
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(4, "Computing PLOF", LOG);
}
FiniteProgress progressPLOFs = LOG.isVerbose() ? new FiniteProgress("PLOFs for objects", relation.size(), LOG) : null;
MeanVariance mv = new MeanVariance();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
final KNNList<D> neighbors = knnComp.getKNNForDBID(iditer, kcomp);
mv.reset();
// use first kref neighbors as comparison set.
int ks = 0;
- for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
- if (objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) {
+ for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ if(objectIsInKNN || !DBIDUtil.equal(neighbor, iditer)) {
mv.put(pdists.doubleValue(neighbor));
ks++;
- if (ks >= kcomp) {
+ if(ks >= kcomp) {
break;
}
}
}
double plof = Math.max(pdists.doubleValue(iditer) / mv.getMean(), 1.0);
- if (Double.isNaN(plof) || Double.isInfinite(plof)) {
+ if(Double.isNaN(plof) || Double.isInfinite(plof)) {
plof = 1.0;
}
plofs.putDouble(iditer, plof);
mvplof.put((plof - 1.0) * (plof - 1.0));
- if (progressPLOFs != null) {
+ if(progressPLOFs != null) {
progressPLOFs.incrementProcessed(LOG);
}
}
}
double nplof = lambda * Math.sqrt(mvplof.getMean());
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.verbose("nplof normalization factor is " + nplof + " " + mvplof.getMean() + " " + mvplof.getSampleStddev());
}
// Compute final LoOP values.
WritableDoubleDataStore loops = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
{// compute LOOP_SCORE of each db object
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(5, "Computing LoOP scores", LOG);
}
FiniteProgress progressLOOPs = LOG.isVerbose() ? new FiniteProgress("LoOP for objects", relation.size(), LOG) : null;
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
loops.putDouble(iditer, NormalDistribution.erf((plofs.doubleValue(iditer) - 1) / (nplof * sqrt2)));
- if (progressLOOPs != null) {
+ if(progressLOOPs != null) {
progressLOOPs.incrementProcessed(LOG);
}
}
}
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.setCompleted(LOG);
}
@@ -351,9 +354,10 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
@Override
public TypeInformation[] getInputTypeRestriction() {
final TypeInformation type;
- if (reachabilityDistanceFunction.equals(comparisonDistanceFunction)) {
+ if(reachabilityDistanceFunction.equals(comparisonDistanceFunction)) {
type = reachabilityDistanceFunction.getInputTypeRestriction();
- } else {
+ }
+ else {
type = new CombinedTypeInformation(reachabilityDistanceFunction.getInputTypeRestriction(), comparisonDistanceFunction.getInputTypeRestriction());
}
return TypeUtil.array(type);
@@ -401,34 +405,35 @@ public class LoOP<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<O
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kcompP = new IntParameter(KCOMP_ID);
- kcompP.addConstraint(new GreaterConstraint(1));
- if (config.grab(kcompP)) {
+ kcompP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(kcompP)) {
kcomp = kcompP.intValue();
}
final ObjectParameter<DistanceFunction<O, D>> compDistP = new ObjectParameter<>(COMPARISON_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
- if (config.grab(compDistP)) {
+ if(config.grab(compDistP)) {
comparisonDistanceFunction = compDistP.instantiateClass(config);
}
final IntParameter kreachP = new IntParameter(KREACH_ID);
- kreachP.addConstraint(new GreaterConstraint(1));
+ kreachP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
kreachP.setOptional(true);
- if (config.grab(kreachP)) {
+ if(config.grab(kreachP)) {
kreach = kreachP.intValue();
- } else {
+ }
+ else {
kreach = kcomp;
}
final ObjectParameter<DistanceFunction<O, D>> reachDistP = new ObjectParameter<>(REACHABILITY_DISTANCE_FUNCTION_ID, DistanceFunction.class, true);
- if (config.grab(reachDistP)) {
+ if(config.grab(reachDistP)) {
reachabilityDistanceFunction = reachDistP.instantiateClass(config);
}
// TODO: make default 1.0?
final DoubleParameter lambdaP = new DoubleParameter(LAMBDA_ID, 2.0);
- lambdaP.addConstraint(new GreaterConstraint(0.0));
- if (config.grab(lambdaP)) {
+ lambdaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(lambdaP)) {
lambda = lambdaP.doubleValue();
}
}
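
The LoOP conversion above compresses to one formula: with plof clamped to at least 1.0 and nplof = lambda * sqrt(mean((plof - 1)^2)), each final score is erf((plof - 1) / (nplof * sqrt(2))), a value in [0, 1) readable as an outlier probability. As a sketch, using the same `NormalDistribution.erf` as the hunk:

    // plof >= 1.0 by construction, so the erf argument is non-negative and
    // the LoOP score lies in [0, 1).
    static double loopScore(double plof, double nplof) {
      return NormalDistribution.erf((plof - 1.0) / (nplof * Math.sqrt(2.0)));
    }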
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java
index 2ff7534a..b990ef35 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimpleKernelDensityLOF.java
@@ -55,13 +55,14 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction;
import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -151,7 +152,7 @@ public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberD
}
double max = ((DoubleDistanceKNNList)knnq.getKNNForDBID(neighbor, k)).doubleKNNDistance();
final double v = neighbor.doubleDistance() / max;
- sum += kernel.density(v) / Math.pow(max, dim);
+ sum += kernel.density(v) / MathUtil.powi(max, dim);
count++;
}
} else {
@@ -161,7 +162,7 @@ public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberD
}
double max = knnq.getKNNForDBID(neighbor, k).getKNNDistance().doubleValue();
final double v = neighbor.getDistance().doubleValue() / max;
- sum += kernel.density(v) / Math.pow(max, dim);
+ sum += kernel.density(v) / MathUtil.powi(max, dim);
count++;
}
}
@@ -268,7 +269,7 @@ public class SimpleKernelDensityLOF<O extends NumberVector<?>, D extends NumberD
super.makeOptions(config);
final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID);
- pK.addConstraint(new GreaterConstraint(1));
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
if (config.grab(pK)) {
k = pK.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java
index 413eaca1..d54b053f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/lof/SimplifiedLOF.java
@@ -57,7 +57,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -118,8 +118,8 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi
// "HEAVY" flag for KNN Query since it is used more than once
KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
// No optimized kNN query - use a preprocessor!
- if (!(knnq instanceof PreprocessorKNNQuery)) {
- if (stepprog != null) {
+ if(!(knnq instanceof PreprocessorKNNQuery)) {
+ if(stepprog != null) {
stepprog.beginStep(1, "Materializing neighborhoods w.r.t. distance function.", LOG);
}
MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<>(relation, getDistanceFunction(), k);
@@ -129,27 +129,28 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi
}
// Compute LRDs
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(2, "Computing densities.", LOG);
}
WritableDoubleDataStore dens = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
FiniteProgress densProgress = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null;
- for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
final KNNList<D> neighbors = knnq.getKNNForDBID(it, k);
double sum = 0.0;
int count = 0;
- if (neighbors instanceof DoubleDistanceKNNList) {
+ if(neighbors instanceof DoubleDistanceKNNList) {
// Fast version for double distances
- for (DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
- if (DBIDUtil.equal(neighbor, it)) {
+ for(DoubleDistanceDBIDListIter neighbor = ((DoubleDistanceKNNList) neighbors).iter(); neighbor.valid(); neighbor.advance()) {
+ if(DBIDUtil.equal(neighbor, it)) {
continue;
}
sum += neighbor.doubleDistance();
count++;
}
- } else {
- for (DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
- if (DBIDUtil.equal(neighbor, it)) {
+ }
+ else {
+ for(DistanceDBIDListIter<D> neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ if(DBIDUtil.equal(neighbor, it)) {
continue;
}
sum += neighbor.getDistance().doubleValue();
@@ -159,16 +160,16 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi
// Avoid division by 0
final double lrd = (sum > 0) ? (count / sum) : 0;
dens.putDouble(it, lrd);
- if (densProgress != null) {
+ if(densProgress != null) {
densProgress.incrementProcessed(LOG);
}
}
- if (densProgress != null) {
+ if(densProgress != null) {
densProgress.ensureCompleted(LOG);
}
// compute LOF_SCORE of each db object
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(3, "Computing SLOFs.", LOG);
}
WritableDoubleDataStore lofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
@@ -176,38 +177,39 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi
DoubleMinMax lofminmax = new DoubleMinMax();
FiniteProgress progressLOFs = LOG.isVerbose() ? new FiniteProgress("Simple LOF scores.", ids.size(), LOG) : null;
- for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
+ for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
final double lrdp = dens.doubleValue(it);
final double lof;
- if (lrdp > 0) {
+ if(lrdp > 0) {
final KNNList<D> neighbors = knnq.getKNNForDBID(it, k);
double sum = 0.0;
int count = 0;
- for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
+ for(DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
// skip the point itself
- if (DBIDUtil.equal(neighbor, it)) {
+ if(DBIDUtil.equal(neighbor, it)) {
continue;
}
sum += dens.doubleValue(neighbor);
count++;
}
lof = sum / (count * lrdp);
- } else {
+ }
+ else {
lof = 1.0;
}
lofs.putDouble(it, lof);
// update minimum and maximum
lofminmax.put(lof);
- if (progressLOFs != null) {
+ if(progressLOFs != null) {
progressLOFs.incrementProcessed(LOG);
}
}
- if (progressLOFs != null) {
+ if(progressLOFs != null) {
progressLOFs.ensureCompleted(LOG);
}
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.setCompleted(LOG);
}
@@ -250,8 +252,8 @@ public class SimplifiedLOF<O, D extends NumberDistance<D, ?>> extends AbstractDi
super.makeOptions(config);
final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID);
- pK.addConstraint(new GreaterConstraint(1));
- if (config.grab(pK)) {
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(pK)) {
k = pK.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java
index 0d0f7303..757b80ad 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/ExternalDoubleOutlierScore.java
@@ -52,6 +52,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.FileUtil;
+import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
@@ -175,7 +176,7 @@ public class ExternalDoubleOutlierScore extends AbstractAlgorithm<OutlierResult>
if(!Double.isNaN(score)) {
throw new AbortException("Score pattern matched twice: previous value " + score + " second value: " + str);
}
- score = Double.parseDouble(str.substring(ms.end()));
+ score = FormatUtil.parseDouble(str.substring(ms.end()));
}
}
if(id != null && !Double.isNaN(score)) {
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java
index 22c20fc3..5b681106 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/FeatureBagging.java
@@ -54,8 +54,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -136,12 +135,12 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements
final int dbdim = RelationUtil.dimensionality(relation);
final int mindim = dbdim >> 1;
final int maxdim = dbdim - 1;
- final Random rand = rnd.getRandom();
+ final Random rand = rnd.getSingleThreadedRandom();
ArrayList<OutlierResult> results = new ArrayList<>(num);
{
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("LOF iterations", num, LOG) : null;
- for (int i = 0; i < num; i++) {
+ for(int i = 0; i < num; i++) {
BitSet dimset = randomSubspace(dbdim, mindim, maxdim, rand);
SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(dimset);
LOF<NumberVector<?>, DoubleDistance> lof = new LOF<>(k, df);
@@ -149,18 +148,18 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements
// run LOF and collect the result
OutlierResult result = lof.run(database, relation);
results.add(result);
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
}
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
DoubleMinMax minmax = new DoubleMinMax();
- if (breadth) {
+ if(breadth) {
FiniteProgress cprog = LOG.isVerbose() ? new FiniteProgress("Combining results", relation.size(), LOG) : null;
Pair<DBIDIter, Relation<Double>>[] IDVectorOntoScoreVector = Pair.newPairArray(results.size());
@@ -168,55 +167,57 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements
    // We need to initialize them now to be able to iterate them "in parallel".
{
int i = 0;
- for (OutlierResult r : results) {
+ for(OutlierResult r : results) {
IDVectorOntoScoreVector[i] = new Pair<DBIDIter, Relation<Double>>(r.getOrdering().iter(relation.getDBIDs()).iter(), r.getScores());
i++;
}
}
// Iterating over the *lines* of the AS_t(i)-matrix.
- for (int i = 0; i < relation.size(); i++) {
+ for(int i = 0; i < relation.size(); i++) {
// Iterating over the elements of a line (breadth-first).
- for (Pair<DBIDIter, Relation<Double>> pair : IDVectorOntoScoreVector) {
+ for(Pair<DBIDIter, Relation<Double>> pair : IDVectorOntoScoreVector) {
DBIDIter iter = pair.first;
// Always true if every algorithm returns a complete result (one score
// for every DBID).
- if (iter.valid()) {
+ if(iter.valid()) {
double score = pair.second.get(iter);
- if (Double.isNaN(scores.doubleValue(iter))) {
+ if(Double.isNaN(scores.doubleValue(iter))) {
scores.putDouble(iter, score);
minmax.put(score);
}
iter.advance();
- } else {
+ }
+ else {
LOG.warning("Incomplete result: Iterator does not contain |DB| DBIDs");
}
}
// Progress does not take the initial mapping into account.
- if (cprog != null) {
+ if(cprog != null) {
cprog.incrementProcessed(LOG);
}
}
- if (cprog != null) {
+ if(cprog != null) {
cprog.ensureCompleted(LOG);
}
- } else {
+ }
+ else {
FiniteProgress cprog = LOG.isVerbose() ? new FiniteProgress("Combining results", relation.size(), LOG) : null;
- for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
double sum = 0.0;
- for (OutlierResult r : results) {
+ for(OutlierResult r : results) {
final Double s = r.getScores().get(iter);
- if (s != null && !Double.isNaN(s)) {
+ if(s != null && !Double.isNaN(s)) {
sum += s;
}
}
scores.putDouble(iter, sum);
minmax.put(sum);
- if (cprog != null) {
+ if(cprog != null) {
cprog.incrementProcessed(LOG);
}
}
- if (cprog != null) {
+ if(cprog != null) {
cprog.ensureCompleted(LOG);
}
}
@@ -237,13 +238,13 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements
BitSet dimset = new BitSet();
// Fill with all dimensions
int[] dims = new int[alldim];
- for (int d = 0; d < alldim; d++) {
+ for(int d = 0; d < alldim; d++) {
dims[d] = d;
}
// Target dimensionality:
int subdim = mindim + rand.nextInt(maxdim - mindim);
// Shrink the subspace to the destination size
- for (int d = 0; d < alldim - subdim; d++) {
+ for(int d = 0; d < alldim - subdim; d++) {
int s = rand.nextInt(alldim - d);
dimset.set(dims[s]);
dims[s] = dims[alldim - d - 1];
@@ -317,21 +318,21 @@ public class FeatureBagging extends AbstractAlgorithm<OutlierResult> implements
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter pK = new IntParameter(LOF.Parameterizer.K_ID);
- pK.addConstraint(new GreaterConstraint(1));
- if (config.grab(pK)) {
+ pK.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(pK)) {
k = pK.getValue();
}
IntParameter numP = new IntParameter(NUM_ID);
- numP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(numP)) {
+ numP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(numP)) {
num = numP.getValue();
}
Flag breadthF = new Flag(BREADTH_ID);
- if (config.grab(breadthF)) {
+ if(config.grab(breadthF)) {
breadth = breadthF.getValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
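
FeatureBagging draws, for each of its `num` LOF rounds, a random axis-parallel subspace whose dimensionality lies between half of `dbdim` and `dbdim - 1` (see the `mindim`/`maxdim` computation above). A self-contained sketch of such a draw via partial Fisher-Yates; this is a hypothetical variant for illustration, and the hunk's own loop records the selection slightly differently:

    import java.util.BitSet;
    import java.util.Random;

    // Hypothetical helper: choose a random subspace of subdim out of alldim axes.
    static BitSet randomSubspace(int alldim, int mindim, int maxdim, Random rand) {
      int[] dims = new int[alldim];
      for(int d = 0; d < alldim; d++) {
        dims[d] = d;
      }
      int subdim = mindim + rand.nextInt(maxdim - mindim); // target size, as above
      BitSet dimset = new BitSet();
      for(int d = 0; d < subdim; d++) {
        int s = d + rand.nextInt(alldim - d); // pick from the unchosen suffix
        int tmp = dims[s]; dims[s] = dims[d]; dims[d] = tmp;
        dimset.set(dims[d]);
      }
      return dimset;
    }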
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java
index 69608293..f92a8b80 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/meta/HiCS.java
@@ -72,7 +72,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -174,9 +174,9 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
final DBIDs ids = relation.getDBIDs();
ArrayList<ArrayDBIDs> subspaceIndex = buildOneDimIndexes(relation);
- Set<HiCSSubspace> subspaces = calculateSubspaces(relation, subspaceIndex, rnd.getRandom());
+ Set<HiCSSubspace> subspaces = calculateSubspaces(relation, subspaceIndex, rnd.getSingleThreadedRandom());
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Number of high-contrast subspaces: " + subspaces.size());
}
List<Relation<Double>> results = new ArrayList<>();
@@ -185,8 +185,8 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
// run outlier detection and collect the result
// TODO extend so that any outlierAlgorithm can be used (use materialized
// relation instead of SubspaceEuclideanDistanceFunction?)
- for (HiCSSubspace dimset : subspaces) {
- if (LOG.isVerbose()) {
+ for(HiCSSubspace dimset : subspaces) {
+ if(LOG.isVerbose()) {
LOG.verbose("Performing outlier detection in subspace " + dimset);
}
@@ -196,22 +196,22 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
// run LOF and collect the result
OutlierResult result = outlierAlgorithm.run(pdb);
results.add(result.getScores());
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
DoubleMinMax minmax = new DoubleMinMax();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double sum = 0.0;
- for (Relation<Double> r : results) {
+ for(Relation<Double> r : results) {
final Double s = r.get(iditer);
- if (s != null && !Double.isNaN(s)) {
+ if(s != null && !Double.isNaN(s)) {
sum += s;
}
}
@@ -237,7 +237,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
ArrayList<ArrayDBIDs> subspaceIndex = new ArrayList<>(dim + 1);
SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
- for (int i = 0; i < dim; i++) {
+ for(int i = 0; i < dim; i++) {
ArrayModifiableDBIDs amDBIDs = DBIDUtil.newArray(relation.getDBIDs());
comp.setDimension(i);
amDBIDs.sort(comp);
@@ -258,7 +258,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
final int dbdim = RelationUtil.dimensionality(relation);
FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Subspace dimensionality", dbdim, LOG) : null;
- if (dprog != null) {
+ if(dprog != null) {
dprog.setProcessed(2, LOG);
}
@@ -266,31 +266,31 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
TopBoundedHeap<HiCSSubspace> dDimensionalList = new TopBoundedHeap<>(cutoff, HiCSSubspace.SORT_BY_CONTRAST_ASC);
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Generating two-element subsets", (dbdim * (dbdim - 1)) >> 1, LOG) : null;
// compute two-element sets of subspaces
- for (int i = 0; i < dbdim; i++) {
- for (int j = i + 1; j < dbdim; j++) {
+ for(int i = 0; i < dbdim; i++) {
+ for(int j = i + 1; j < dbdim; j++) {
HiCSSubspace ts = new HiCSSubspace();
ts.set(i);
ts.set(j);
calculateContrast(relation, ts, subspaceIndex, random);
dDimensionalList.add(ts);
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
IndefiniteProgress qprog = LOG.isVerbose() ? new IndefiniteProgress("Testing subspace candidates", LOG) : null;
- for (int d = 3; !dDimensionalList.isEmpty(); d++) {
- if (dprog != null) {
+ for(int d = 3; !dDimensionalList.isEmpty(); d++) {
+ if(dprog != null) {
dprog.setProcessed(d, LOG);
}
// result now contains all d-dimensional sets of subspaces
ArrayList<HiCSSubspace> candidateList = new ArrayList<>(dDimensionalList.size());
- for (Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
+ for(Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
subspaceList.add(it.get());
candidateList.add(it.get());
}
@@ -299,39 +299,39 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
Collections.sort(candidateList, HiCSSubspace.SORT_BY_SUBSPACE);
// TODO: optimize APRIORI style, by not even computing the bit set or?
- for (int i = 0; i < candidateList.size() - 1; i++) {
- for (int j = i + 1; j < candidateList.size(); j++) {
+ for(int i = 0; i < candidateList.size() - 1; i++) {
+ for(int j = i + 1; j < candidateList.size(); j++) {
HiCSSubspace set1 = candidateList.get(i);
HiCSSubspace set2 = candidateList.get(j);
HiCSSubspace joinedSet = new HiCSSubspace();
joinedSet.or(set1);
joinedSet.or(set2);
- if (joinedSet.cardinality() != d) {
+ if(joinedSet.cardinality() != d) {
continue;
}
calculateContrast(relation, joinedSet, subspaceIndex, random);
dDimensionalList.add(joinedSet);
- if (qprog != null) {
+ if(qprog != null) {
qprog.incrementProcessed(LOG);
}
}
}
// Prune
- for (HiCSSubspace cand : candidateList) {
- for (Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
- if (it.get().contrast > cand.contrast) {
+ for(HiCSSubspace cand : candidateList) {
+ for(Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
+ if(it.get().contrast > cand.contrast) {
subspaceList.remove(cand);
break;
}
}
}
}
- if (qprog != null) {
+ if(qprog != null) {
qprog.setCompleted(LOG);
}
- if (dprog != null) {
+ if(dprog != null) {
dprog.setProcessed(dbdim, LOG);
dprog.ensureCompleted(LOG);
}
@@ -353,17 +353,17 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
int retries = 0;
double deviationSum = 0.0;
- for (int i = 0; i < m; i++) {
+ for(int i = 0; i < m; i++) {
// Choose a random set bit.
int chosen = -1;
- for (int tmp = random.nextInt(card); tmp >= 0; tmp--) {
+ for(int tmp = random.nextInt(card); tmp >= 0; tmp--) {
chosen = subspace.nextSetBit(chosen + 1);
}
// initialize sample
DBIDs conditionalSample = relation.getDBIDs();
- for (int j = subspace.nextSetBit(0); j >= 0; j = subspace.nextSetBit(j + 1)) {
- if (j == chosen) {
+ for(int j = subspace.nextSetBit(0); j >= 0; j = subspace.nextSetBit(j + 1)) {
+ if(j == chosen) {
continue;
}
ArrayDBIDs sortedIndices = subspaceIndex.get(j);
@@ -371,20 +371,21 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
// initialize index block
DBIDArrayIter iter = sortedIndices.iter();
iter.seek(random.nextInt(relation.size() - windowsize));
- for (int k = 0; k < windowsize; k++, iter.advance()) {
+ for(int k = 0; k < windowsize; k++, iter.advance()) {
indexBlock.add(iter); // select index block
}
conditionalSample = DBIDUtil.intersection(conditionalSample, indexBlock);
}
- if (conditionalSample.size() < 10) {
+ if(conditionalSample.size() < 10) {
retries++;
- if (LOG.isDebugging()) {
+ if(LOG.isDebugging()) {
LOG.debug("Sample size very small. Retry no. " + retries);
}
- if (retries >= MAX_RETRIES) {
+ if(retries >= MAX_RETRIES) {
LOG.warning("Too many retries, for small samples: " + retries);
- } else {
+ }
+ else {
i--;
continue;
}
@@ -393,7 +394,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
double[] sampleValues = new double[conditionalSample.size()];
{
int l = 0;
- for (DBIDIter iter = conditionalSample.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = conditionalSample.iter(); iter.valid(); iter.advance()) {
sampleValues[l] = relation.get(iter).doubleValue(chosen);
l++;
}
@@ -402,23 +403,23 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
double[] fullValues = new double[relation.size()];
{
int l = 0;
- for (DBIDIter iter = subspaceIndex.get(chosen).iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = subspaceIndex.get(chosen).iter(); iter.valid(); iter.advance()) {
fullValues[l] = relation.get(iter).doubleValue(chosen);
l++;
}
}
double contrast = statTest.deviation(fullValues, sampleValues);
- if (Double.isNaN(contrast)) {
+ if(Double.isNaN(contrast)) {
i--;
LOG.warning("Contrast was NaN");
continue;
}
deviationSum += contrast;
- if (prog != null) {
+ if(prog != null) {
prog.incrementProcessed(LOG);
}
}
- if (prog != null) {
+ if(prog != null) {
prog.ensureCompleted(LOG);
}
subspace.contrast = deviationSum / m;
@@ -464,7 +465,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("[contrast=").append(contrast);
- for (int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
+ for(int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
buf.append(' ').append(i + 1);
}
buf.append(']');
@@ -477,7 +478,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
public static final Comparator<HiCSSubspace> SORT_BY_CONTRAST_ASC = new Comparator<HiCSSubspace>() {
@Override
public int compare(HiCSSubspace o1, HiCSSubspace o2) {
- if (o1.contrast == o2.contrast) {
+ if(o1.contrast == o2.contrast) {
return 0;
}
return o1.contrast > o2.contrast ? 1 : -1;
@@ -490,7 +491,7 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
public static final Comparator<HiCSSubspace> SORT_BY_CONTRAST_DESC = new Comparator<HiCSSubspace>() {
@Override
public int compare(HiCSSubspace o1, HiCSSubspace o2) {
- if (o1.contrast == o2.contrast) {
+ if(o1.contrast == o2.contrast) {
return 0;
}
return o1.contrast < o2.contrast ? 1 : -1;
@@ -505,10 +506,11 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
public int compare(HiCSSubspace o1, HiCSSubspace o2) {
int dim1 = o1.nextSetBit(0);
int dim2 = o2.nextSetBit(0);
- while (dim1 >= 0 && dim2 >= 0) {
- if (dim1 < dim2) {
+ while(dim1 >= 0 && dim2 >= 0) {
+ if(dim1 < dim2) {
return -1;
- } else if (dim1 > dim2) {
+ }
+ else if(dim1 > dim2) {
return 1;
}
dim1 = o1.nextSetBit(dim1 + 1);
@@ -597,35 +599,35 @@ public class HiCS<V extends NumberVector<?>> extends AbstractAlgorithm<OutlierRe
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter mP = new IntParameter(M_ID, 50);
- mP.addConstraint(new GreaterConstraint(1));
- if (config.grab(mP)) {
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(mP)) {
m = mP.intValue();
}
final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.1);
- alphaP.addConstraint(new GreaterConstraint(0));
- if (config.grab(alphaP)) {
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
final ObjectParameter<OutlierAlgorithm> algoP = new ObjectParameter<>(ALGO_ID, OutlierAlgorithm.class, LOF.class);
- if (config.grab(algoP)) {
+ if(config.grab(algoP)) {
outlierAlgorithm = algoP.instantiateClass(config);
}
final ObjectParameter<GoodnessOfFitTest> testP = new ObjectParameter<>(TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class);
- if (config.grab(testP)) {
+ if(config.grab(testP)) {
statTest = testP.instantiateClass(config);
}
final IntParameter cutoffP = new IntParameter(LIMIT_ID, 100);
- cutoffP.addConstraint(new GreaterConstraint(1));
- if (config.grab(cutoffP)) {
+ cutoffP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(cutoffP)) {
cutoff = cutoffP.intValue();
}
final RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
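Aside: a condensed, self-contained sketch of the Monte Carlo contrast estimate that calculateContrast() above performs. It conditions on random index slices of all but one subspace attribute and compares the conditional marginal against the full marginal; a plain difference of means stands in for ELKI's pluggable GoodnessOfFitTest (e.g. Kolmogorov-Smirnov). Everything below is illustrative, not ELKI code:

import java.util.HashSet;
import java.util.Random;
import java.util.Set;

public class HicsContrastSketch {
  // Contrast of a subspace (column indices into data); requires >= 2 columns.
  static double contrast(double[][] data, int[] subspace, int m, double alpha, Random rnd) {
    final int n = data.length;
    // Slice width per conditioning attribute, so the expected conditional
    // sample fraction is roughly alpha, as in the HiCS paper:
    final int window = (int) Math.ceil(Math.pow(alpha, 1. / (subspace.length - 1)) * n);
    double deviationSum = 0.;
    for (int rep = 0; rep < m; rep++) {
      int chosen = subspace[rnd.nextInt(subspace.length)];
      Set<Integer> sample = null; // intersection of the index slices
      for (int d : subspace) {
        if (d == chosen) {
          continue;
        }
        Integer[] idx = sortedBy(data, d); // cf. buildOneDimIndexes() above
        int start = rnd.nextInt(n - window + 1);
        Set<Integer> slice = new HashSet<>();
        for (int k = 0; k < window; k++) {
          slice.add(idx[start + k]);
        }
        if (sample == null) {
          sample = slice;
        }
        else {
          sample.retainAll(slice);
        }
      }
      // Note: the real implementation retries when the sample gets too small.
      deviationSum += Math.abs(mean(data, chosen, sample) - mean(data, chosen, null));
    }
    return deviationSum / m;
  }

  static Integer[] sortedBy(double[][] data, int d) {
    Integer[] idx = new Integer[data.length];
    for (int i = 0; i < idx.length; i++) {
      idx[i] = i;
    }
    java.util.Arrays.sort(idx, (a, b) -> Double.compare(data[a][d], data[b][d]));
    return idx;
  }

  static double mean(double[][] data, int d, Set<Integer> ids) {
    double sum = 0.;
    int cnt = 0;
    if (ids == null) {
      for (double[] row : data) {
        sum += row[d];
        cnt++;
      }
    }
    else {
      for (int i : ids) {
        sum += data[i][d];
        cnt++;
      }
    }
    return cnt > 0 ? sum / cnt : 0.;
  }

  public static void main(String[] args) {
    Random rnd = new Random(0);
    double[][] data = new double[200][2];
    for (int i = 0; i < data.length; i++) {
      double x = rnd.nextDouble();
      data[i][0] = x;
      data[i][1] = x + 0.01 * rnd.nextGaussian(); // correlated: high contrast
    }
    System.out.println(contrast(data, new int[] { 0, 1 }, 50, 0.1, rnd));
  }
}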
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java
index c8efe4da..85524b4e 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/CTLuRandomWalkEC.java
@@ -56,7 +56,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -149,7 +149,7 @@ public class CTLuRandomWalkEC<N, D extends NumberDistance<D, ?>> extends Abstrac
}
final double e;
final D distance = distFunc.distance(id, n);
- heap.add(distance, n);
+ heap.insert(distance, n);
double dist = distance.doubleValue();
if(dist == 0) {
LOG.warning("Zero distances are not supported - skipping: " + DBIDUtil.toString(id) + " " + DBIDUtil.toString(n));
@@ -296,7 +296,7 @@ public class CTLuRandomWalkEC<N, D extends NumberDistance<D, ?>> extends Abstrac
*/
protected void configK(Parameterization config) {
final IntParameter param = new IntParameter(K_ID);
- param.addConstraint(new GreaterEqualConstraint(1));
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(param)) {
k = param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java
index e07ce480..1a1f9a82 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/TrimmedMeanApproach.java
@@ -1,26 +1,27 @@
package de.lmu.ifi.dbs.elki.algorithm.outlier.spatial;
-/*
-This file is part of ELKI:
-Environment for Developing KDD-Applications Supported by Index-Structures
-
-Copyright (C) 2013
-Ludwig-Maximilians-Universität München
-Lehr- und Forschungseinheit für Datenbanksysteme
-ELKI Development Team
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
-
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2013
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
import java.util.Arrays;
@@ -50,15 +51,15 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
/**
* A Trimmed Mean Approach to Finding Spatial Outliers.
*
- * Outliers are defined by their value deviation from a trimmed mean of the neighbors.
+ * Outliers are defined by their value deviation from a trimmed mean of the
+ * neighbors.
*
* <p>
* Reference: <br>
@@ -116,7 +117,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> {
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing trimmed means", relation.size(), LOG) : null;
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
DBIDs neighbors = npred.getNeighborDBIDs(iditer);
int num = 0;
double[] values = new double[neighbors.size()];
@@ -161,7 +162,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> {
double[] ei = new double[relation.size()];
{
int i = 0;
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
ei[i] = errors.doubleValue(iditer);
i++;
}
@@ -180,7 +181,7 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> {
}
// calculate score
DoubleMinMax minmax = new DoubleMinMax();
- for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double score = Math.abs(errors.doubleValue(iditer)) * 0.6745 / median_dev_from_median;
scores.putDouble(iditer, score);
minmax.put(score);
@@ -228,8 +229,8 @@ public class TrimmedMeanApproach<N> extends AbstractNeighborhoodOutlier<N> {
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter pP = new DoubleParameter(P_ID);
- pP.addConstraint(new GreaterConstraint(0.0));
- pP.addConstraint(new LessConstraint(0.5));
+ pP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ pP.addConstraint(CommonConstraints.LESS_THAN_HALF_DOUBLE);
if(config.grab(pP)) {
p = pP.getValue();
}
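Aside: the scoring line kept in the hunk above, score = |e_i| * 0.6745 / median_dev_from_median, is the classic modified z-score built on the median absolute deviation (MAD); the constant 0.6745 is approximately the 0.75 quantile of the standard normal, which rescales the MAD to be comparable with a standard deviation under normality. A self-contained sketch of that computation (illustrative, not the ELKI class):

import java.util.Arrays;

public class ModifiedZScore {
  // Median of a copy of the input (does not modify the argument).
  static double median(double[] v) {
    double[] s = v.clone();
    Arrays.sort(s);
    int n = s.length;
    return (n % 2 == 1) ? s[n / 2] : 0.5 * (s[n / 2 - 1] + s[n / 2]);
  }

  public static void main(String[] args) {
    double[] errors = { 0.1, -0.2, 0.05, 3.0, -0.1 };
    double med = median(errors);
    double[] absdev = new double[errors.length];
    for (int i = 0; i < errors.length; i++) {
      absdev[i] = Math.abs(errors[i] - med);
    }
    double mad = median(absdev); // median_dev_from_median in the hunk above
    for (double e : errors) {
      // score = |e| * 0.6745 / MAD, as in the retained scoring line:
      System.out.printf("error=%5.2f score=%.3f%n", e, Math.abs(e) * 0.6745 / mad);
    }
  }
}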
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java
index c4fc4407..c93b10cb 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExtendedNeighborhood.java
@@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -224,7 +224,7 @@ public class ExtendedNeighborhood extends AbstractPrecomputedNeighborhood {
*/
public static int getParameterSteps(Parameterization config) {
final IntParameter param = new IntParameter(STEPS_ID);
- param.addConstraint(new GreaterEqualConstraint(1));
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(param)) {
return param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java
index 96896bd8..33b5010a 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/ExternalNeighborhood.java
@@ -161,8 +161,8 @@ public class ExternalNeighborhood extends AbstractPrecomputedNeighborhood {
if(olq != null) {
LabelList label = olq.get(iditer);
if(label != null) {
- for(String lbl : label) {
- lblmap.put(lbl, DBIDUtil.deref(iditer));
+ for(int i = 0; i < label.size(); i++) {
+ lblmap.put(label.get(i), DBIDUtil.deref(iditer));
}
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java
index 05bf2f18..4d6ec635 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/spatial/neighborhood/weighted/LinearWeightedExtendedNeighborhood.java
@@ -38,7 +38,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -216,7 +216,7 @@ public class LinearWeightedExtendedNeighborhood implements WeightedNeighborSetPr
*/
public static int getParameterSteps(Parameterization config) {
final IntParameter param = new IntParameter(STEPS_ID);
- param.addConstraint(new GreaterEqualConstraint(1));
+ param.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(param)) {
return param.getValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java
index ae04fef4..c21542da 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OUTRES.java
@@ -55,6 +55,7 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution;
import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction;
@@ -368,7 +369,7 @@ public class OUTRES<V extends NumberVector<?>> extends AbstractAlgorithm<Outlier
*/
protected double optimalBandwidth(int dim) {
// Pi in the publication is redundant and cancels out!
- double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * Math.pow(2, dim);
+ double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * MathUtil.powi(2, dim);
return hopt * Math.pow(relation.size(), (-1. / (dim + 4)));
}
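Aside: the hunk above swaps Math.pow(2, dim) for an integer power routine, which is exact and avoids a transcendental call for integer exponents. A minimal exponentiation-by-squaring sketch of what such a powi can look like; this is an assumption for illustration, not the source of ELKI's MathUtil.powi:

public class PowiSketch {
  // Exponentiation by squaring for non-negative integer exponents.
  static double powi(double base, int exp) {
    double result = 1., b = base;
    for (int e = exp; e > 0; e >>= 1) {
      if ((e & 1) == 1) {
        result *= b;
      }
      b *= b;
    }
    return result;
  }

  public static void main(String[] args) {
    for (int dim = 1; dim <= 8; dim++) {
      System.out.println(powi(2, dim) + " vs " + Math.pow(2, dim));
    }
  }
}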
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java
index 96c8875f..3e248bfa 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/OutRankS1.java
@@ -49,7 +49,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -186,7 +186,7 @@ public class OutRankS1 extends AbstractAlgorithm<OutlierResult> implements Outli
algorithm = algP.instantiateClass(config);
}
DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 0.25);
- alphaP.addConstraint(new GreaterConstraint(0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
if (config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
index b2255e67..489f811b 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/subspace/SOD.java
@@ -31,10 +31,10 @@ import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
-import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
@@ -44,7 +44,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction;
@@ -52,7 +51,9 @@ import de.lmu.ifi.dbs.elki.distance.similarityfunction.SimilarityFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
-import de.lmu.ifi.dbs.elki.result.ResultHierarchy;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
@@ -65,9 +66,10 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
@@ -91,7 +93,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @param <V> the type of NumberVector handled by this Algorithm
* @param <D> distance type
*/
-// todo arthur comment
@Title("SOD: Subspace outlier degree")
@Description("Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data")
@Reference(authors = "H.-P. Kriegel, P. Kröger, E. Schubert, A. Zimek", title = "Outlier Detection in Axis-Parallel Subspaces of High Dimensional Data", booktitle = "Proceedings of the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Bangkok, Thailand, 2009", url = "http://dx.doi.org/10.1007/978-3-642-01307-2")
@@ -102,50 +103,39 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
private static final Logging LOG = Logging.getLogger(SOD.class);
/**
- * Parameter to specify the number of shared nearest neighbors to be
- * considered for learning the subspace properties., must be an integer
- * greater than 0.
- */
- public static final OptionID KNN_ID = new OptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties.");
-
- /**
- * Parameter to indicate the multiplier for the discriminance value for
- * discerning small from large variances.
- */
- public static final OptionID ALPHA_ID = new OptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances.");
-
- /**
- * Parameter for the similarity function.
- */
- public static final OptionID SIM_ID = new OptionID("sod.similarity", "The similarity function used for the neighborhood set.");
-
- /**
- * Holds the value of {@link #KNN_ID}.
+ * Neighborhood size.
*/
private int knn;
/**
- * Holds the value of {@link #ALPHA_ID}.
+ * Alpha (discriminance value).
*/
private double alpha;
/**
- * The similarity function {@link #SIM_ID}.
+ * Similarity function to use.
*/
private SimilarityFunction<V, D> similarityFunction;
/**
+ * Report models.
+ */
+ private boolean models;
+
+ /**
* Constructor with parameters.
*
* @param knn knn value
* @param alpha Alpha parameter
* @param similarityFunction Shared nearest neighbor similarity function
+ * @param models Report generated models
*/
- public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction) {
+ public SOD(int knn, double alpha, SimilarityFunction<V, D> similarityFunction, boolean models) {
super();
this.knn = knn;
this.alpha = alpha;
this.similarityFunction = similarityFunction;
+ this.models = models;
}
/**
@@ -157,26 +147,55 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
public OutlierResult run(Relation<V> relation) {
SimilarityQuery<V, D> snnInstance = similarityFunction.instantiate(relation);
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Assigning Subspace Outlier Degree", relation.size(), LOG) : null;
- WritableDataStore<SODModel<?>> sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class);
+ final WritableDoubleDataStore sod_scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
+ WritableDataStore<SODModel> sod_models = null;
+ if (models) { // Models requested
+ sod_models = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, SODModel.class);
+ }
DoubleMinMax minmax = new DoubleMinMax();
for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
if (progress != null) {
progress.incrementProcessed(LOG);
}
- DBIDs knnList = getNearestNeighbors(relation, snnInstance, iter);
- SODModel<V> model = new SODModel<>(relation, knnList, alpha, relation.get(iter));
- sod_models.put(iter, model);
- minmax.put(model.getSod());
+ DBIDs neighborhood = getNearestNeighbors(relation, snnInstance, iter);
+
+ Vector center;
+ BitSet weightVector;
+ double sod;
+ if (neighborhood.size() > 0) {
+ center = Centroid.make(relation, neighborhood);
+ // Note: per-dimension variances; no covariances.
+ double[] variances = computePerDimensionVariances(relation, center, neighborhood);
+ double expectationOfVariance = Mean.of(variances);
+ weightVector = new BitSet(variances.length);
+ for (int d = 0; d < variances.length; d++) {
+ if (variances[d] < alpha * expectationOfVariance) {
+ weightVector.set(d, true);
+ }
+ }
+ sod = subspaceOutlierDegree(relation.get(iter), center, weightVector);
+ } else {
+ center = relation.get(iter).getColumnVector();
+ weightVector = null;
+ sod = 0.;
+ }
+
+ if (sod_models != null) {
+ sod_models.put(iter, new SODModel(center, weightVector));
+ }
+ sod_scores.putDouble(iter, sod);
+ minmax.put(sod);
}
if (progress != null) {
progress.ensureCompleted(LOG);
}
// combine results.
- Relation<SODModel<?>> models = new MaterializedRelation<>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation<SODModel<?>>(SODModel.class), sod_models, relation.getDBIDs());
OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
- OutlierResult sodResult = new OutlierResult(meta, new SODProxyScoreResult(models, relation.getDBIDs()));
- // also add the models.
- sodResult.addChildResult(models);
+ OutlierResult sodResult = new OutlierResult(meta, new MaterializedRelation<>("Subspace Outlier Degree", "sod-outlier", TypeUtil.DOUBLE, sod_scores, relation.getDBIDs()));
+ if (sod_models != null) {
+ Relation<SODModel> models = new MaterializedRelation<>("Subspace Outlier Model", "sod-outlier", new SimpleTypeInformation<>(SODModel.class), sod_models, relation.getDBIDs());
+ sodResult.addChildResult(models);
+ }
return sodResult;
}
@@ -186,6 +205,8 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* <p/>
* The query object is excluded from the knn list.
*
+ * FIXME: move this to the database layer.
+ *
* @param relation the database holding the objects
* @param simQ similarity function
* @param queryObject the query object for which the kNNs should be determined
@@ -193,14 +214,14 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* distance without the query object
*/
private DBIDs getNearestNeighbors(Relation<V> relation, SimilarityQuery<V, D> simQ, DBIDRef queryObject) {
- // similarityFunction.getPreprocessor().getParameters();
Heap<DoubleDBIDPair> nearestNeighbors = new TiedTopBoundedHeap<>(knn);
for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
- if (!DBIDUtil.equal(iter, queryObject)) {
- double sim = simQ.similarity(queryObject, iter).doubleValue();
- if (sim > 0) {
- nearestNeighbors.add(DBIDUtil.newPair(sim, iter));
- }
+ if (DBIDUtil.equal(iter, queryObject)) {
+ continue;
+ }
+ double sim = simQ.similarity(queryObject, iter).doubleValue();
+ if (sim > 0.) {
+ nearestNeighbors.add(DBIDUtil.newPair(sim, iter));
}
}
// Collect DBIDs
@@ -211,6 +232,50 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
return dbids;
}
+ /**
+ * Compute the per-dimension variances for the given neighborhood and center.
+ *
+ * @param relation Data relation
+ * @param center Center vector
+ * @param neighborhood Neighbors
+ * @return Per-dimension variances.
+ */
+ private static double[] computePerDimensionVariances(Relation<? extends NumberVector<?>> relation, Vector center, DBIDs neighborhood) {
+ double[] c = center.getArrayRef();
+ double[] variances = new double[c.length];
+ for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
+ NumberVector<?> databaseObject = relation.get(iter);
+ for (int d = 0; d < c.length; d++) {
+ final double deviation = databaseObject.doubleValue(d) - c[d];
+ variances[d] += deviation * deviation;
+ }
+ }
+ for (int d = 0; d < variances.length; d++) {
+ variances[d] /= neighborhood.size();
+ }
+ return variances;
+ }
+
+ /**
+ * Compute SOD score.
+ *
+ * @param queryObject Query object
+ * @param center Center vector
+ * @param weightVector Weight vector
+ * @return sod score
+ */
+ private double subspaceOutlierDegree(V queryObject, Vector center, BitSet weightVector) {
+ final int card = weightVector.cardinality();
+ if (card == 0) {
+ return 0;
+ }
+ final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector);
+ double distance = df.distance(queryObject, center).doubleValue();
+ distance /= card; // FIXME: defined as card, should be sqrt(card),
+ // unfortunately
+ return distance;
+ }
+
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
@@ -225,232 +290,89 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
* SOD Model class
*
* @author Arthur Zimek
- * @param <V> the type of DatabaseObjects handled by this Result
*/
- // TODO: arthur comment
- public static class SODModel<V extends NumberVector<?>> implements TextWriteable, Comparable<SODModel<?>> {
- private double[] centerValues;
-
- private V center;
-
- private double[] variances;
-
- private double expectationOfVariance;
-
- private BitSet weightVector;
-
- private double sod;
-
+ public static class SODModel implements TextWriteable {
/**
- * Initialize SOD Model
- *
- * @param relation Database
- * @param neighborhood Neighborhood
- * @param alpha Alpha value
- * @param queryObject Query object
+ * Center vector
*/
- public SODModel(Relation<V> relation, DBIDs neighborhood, double alpha, V queryObject) {
- if (neighborhood.size() > 0) {
- // TODO: store database link?
- centerValues = new double[RelationUtil.dimensionality(relation)];
- variances = new double[centerValues.length];
- for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
- V databaseObject = relation.get(iter);
- for (int d = 0; d < centerValues.length; d++) {
- centerValues[d] += databaseObject.doubleValue(d);
- }
- }
- for (int d = 0; d < centerValues.length; d++) {
- centerValues[d] /= neighborhood.size();
- }
- for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
- V databaseObject = relation.get(iter);
- for (int d = 0; d < centerValues.length; d++) {
- // distance
- double distance = centerValues[d] - databaseObject.doubleValue(d);
- // variance
- variances[d] += distance * distance;
- }
- }
- expectationOfVariance = 0;
- for (int d = 0; d < variances.length; d++) {
- variances[d] /= neighborhood.size();
- expectationOfVariance += variances[d];
- }
- expectationOfVariance /= variances.length;
- weightVector = new BitSet(variances.length);
- for (int d = 0; d < variances.length; d++) {
- if (variances[d] < alpha * expectationOfVariance) {
- weightVector.set(d, true);
- }
- }
- center = RelationUtil.getNumberVectorFactory(relation).newNumberVector(centerValues);
- sod = subspaceOutlierDegree(queryObject, center, weightVector);
- } else {
- center = queryObject;
- sod = 0.0;
- }
- }
+ private Vector center;
/**
- * Compute SOD score.
- *
- * @param queryObject Query object
- * @param center Center vector
- * @param weightVector Weight vector
- * @return sod score
+ * Relevant dimensions.
*/
- private double subspaceOutlierDegree(V queryObject, V center, BitSet weightVector) {
- final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(weightVector);
- final int card = weightVector.cardinality();
- if (card == 0) {
- return 0;
- }
- double distance = df.distance(queryObject, center).doubleValue();
- distance /= card;
- return distance;
- }
+ private BitSet weightVector;
/**
- * Return the SOD of the point.
+ * Initialize SOD Model
*
- * @return sod value
+ * @param center Center vector
+ * @param weightVector Selected dimensions
*/
- public double getSod() {
- return this.sod;
+ public SODModel(Vector center, BitSet weightVector) {
+ this.center = center;
+ this.weightVector = weightVector;
}
@Override
public void writeToText(TextWriterStream out, String label) {
- out.inlinePrint(label + "=" + this.sod);
out.commentPrintLn(this.getClass().getSimpleName() + ":");
out.commentPrintLn("relevant attributes (counting starts with 0): " + this.weightVector.toString());
out.commentPrintLn("center of neighborhood: " + out.normalizationRestore(center).toString());
- out.commentPrintLn("subspace outlier degree: " + this.sod);
out.commentPrintSeparator();
}
-
- @Override
- public int compareTo(SODModel<?> o) {
- return Double.compare(this.getSod(), o.getSod());
- }
-
}
/**
- * Proxy class that converts a model result to an actual SOD score result.
+ * Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
- protected static class SODProxyScoreResult implements Relation<Double> {
+ public static class Parameterizer<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
/**
- * Model result this is a proxy for.
+ * Parameter to specify the number of shared nearest neighbors to be
+     * considered for learning the subspace properties; must be an integer
+ * greater than 0.
*/
- Relation<SODModel<?>> models;
+ public static final OptionID KNN_ID = new OptionID("sod.knn", "The number of most snn-similar objects to use as reference set for learning the subspace properties.");
/**
- * The IDs we are defined for.
+ * Parameter to indicate the multiplier for the discriminance value for
+ * discerning small from large variances.
*/
- DBIDs dbids;
+ public static final OptionID ALPHA_ID = new OptionID("sod.alpha", "The multiplier for the discriminance value for discerning small from large variances.");
/**
- * Constructor.
- *
- * @param models Models result
- * @param dbids IDs we are defined for
+ * Parameter for the similarity function.
*/
- public SODProxyScoreResult(Relation<SODModel<?>> models, DBIDs dbids) {
- super();
- this.models = models;
- this.dbids = dbids;
- }
-
- @Override
- public Double get(DBIDRef objID) {
- return models.get(objID).getSod();
- }
-
- @Override
- public String getLongName() {
- return "Subspace Outlier Degree";
- }
-
- @Override
- public String getShortName() {
- return "sod-outlier";
- }
-
- @Override
- public DBIDs getDBIDs() {
- return dbids;
- }
-
- @Override
- public DBIDIter iterDBIDs() {
- return dbids.iter();
- }
-
- @Override
- public Database getDatabase() {
- return null; // FIXME
- }
+ public static final OptionID SIM_ID = new OptionID("sod.similarity", "The similarity function used for the neighborhood set.");
- @Override
- public void set(DBIDRef id, Double val) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void delete(DBIDRef id) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public SimpleTypeInformation<Double> getDataTypeInformation() {
- return TypeUtil.DOUBLE;
- }
-
- @Override
- public int size() {
- return dbids.size();
- }
-
- @Override
- public ResultHierarchy getHierarchy() {
- return models.getHierarchy();
- }
-
- @Override
- public void setHierarchy(ResultHierarchy hierarchy) {
- models.setHierarchy(hierarchy);
- }
- }
+ /**
+ * Parameter for keeping the models.
+ */
+ public static final OptionID MODELS_ID = new OptionID("sod.models", "Report the models computed by SOD (default: report only scores).");
- /**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
- public static class Parameterizer<V extends NumberVector<?>, D extends NumberDistance<D, ?>> extends AbstractParameterizer {
/**
- * Holds the value of {@link #KNN_ID}.
+     * Neighborhood size.
*/
private int knn = 1;
/**
- * Holds the value of {@link #ALPHA_ID}.
+ * Alpha (discriminance value).
*/
private double alpha = 1.1;
/**
- * The similarity function - {@link #SIM_ID}.
+ * The similarity function.
*/
private SimilarityFunction<V, D> similarityFunction;
+ /**
+ * Track models.
+ */
+ private boolean models = false;
+
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
@@ -460,21 +382,26 @@ public class SOD<V extends NumberVector<?>, D extends NumberDistance<D, ?>> exte
}
final IntParameter knnP = new IntParameter(KNN_ID);
- knnP.addConstraint(new GreaterConstraint(0));
+ knnP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if (config.grab(knnP)) {
knn = knnP.getValue();
}
final DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.1);
- alphaP.addConstraint(new GreaterConstraint(0));
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
if (config.grab(alphaP)) {
alpha = alphaP.doubleValue();
}
+
+ final Flag modelsF = new Flag(MODELS_ID);
+ if (config.grab(modelsF)) {
+ models = modelsF.isTrue();
+ }
}
@Override
protected SOD<V, D> makeInstance() {
- return new SOD<>(knn, alpha, similarityFunction);
+ return new SOD<>(knn, alpha, similarityFunction, models);
}
}
}
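Aside: a condensed standalone sketch of the refactored SOD scoring introduced above (the logic of run(), computePerDimensionVariances(), and subspaceOutlierDegree() in one function): compute the neighborhood centroid and per-dimension variances, select the dimensions whose variance is below alpha times the average variance, and score the query by its Euclidean distance to the centroid in that subspace, divided by the number of selected dimensions. Illustrative only, not the ELKI classes:

import java.util.BitSet;

public class SodScoreSketch {
  static double sod(double[] query, double[][] neighbors, double alpha) {
    final int dim = query.length;
    double[] center = new double[dim], var = new double[dim];
    for (double[] nb : neighbors) {
      for (int d = 0; d < dim; d++) {
        center[d] += nb[d];
      }
    }
    for (int d = 0; d < dim; d++) {
      center[d] /= neighbors.length;
    }
    // Per-dimension variances around the centroid; no covariances.
    for (double[] nb : neighbors) {
      for (int d = 0; d < dim; d++) {
        double dev = nb[d] - center[d];
        var[d] += dev * dev;
      }
    }
    double expectation = 0.;
    for (int d = 0; d < dim; d++) {
      var[d] /= neighbors.length;
      expectation += var[d];
    }
    expectation /= dim;
    BitSet weights = new BitSet(dim);
    for (int d = 0; d < dim; d++) {
      if (var[d] < alpha * expectation) {
        weights.set(d); // low-variance dimension: relevant subspace
      }
    }
    final int card = weights.cardinality();
    if (card == 0) {
      return 0.;
    }
    double distsq = 0.;
    for (int d = weights.nextSetBit(0); d >= 0; d = weights.nextSetBit(d + 1)) {
      double delta = query[d] - center[d];
      distsq += delta * delta;
    }
    return Math.sqrt(distsq) / card; // divided by card, as per the FIXME above
  }

  public static void main(String[] args) {
    double[][] nb = { { 1, 0, 5 }, { 1.1, 0.2, -4 }, { 0.9, -0.1, 9 } };
    System.out.println(sod(new double[] { 1, 3, 0 }, nb, 1.1));
  }
}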
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java
index 84e3ad41..6f2f2f38 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialAverageCoordinateOutlier.java
@@ -80,13 +80,13 @@ public class TrivialAverageCoordinateOutlier extends AbstractAlgorithm<OutlierRe
m.reset();
NumberVector<?> nv = relation.get(iditer);
for (int i = 0; i < nv.getDimensionality(); i++) {
- m.put(nv.doubleValue(i + 1));
+ m.put(nv.doubleValue(i));
}
final double score = m.getMean();
scores.putDouble(iditer, score);
minmax.put(score);
}
- Relation<Double> scoreres = new MaterializedRelation<Double>("Trivial mean score", "mean-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs());
+ Relation<Double> scoreres = new MaterializedRelation<>("Trivial mean score", "mean-outlier", TypeUtil.DOUBLE, scores, relation.getDBIDs());
OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax());
return new OutlierResult(meta, scoreres);
}
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java
index 285b00df..2e952b5f 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/outlier/trivial/TrivialGeneratedOutlier.java
@@ -52,8 +52,7 @@ import de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
@@ -106,7 +105,8 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> im
try {
Relation<?> relation = database.getRelation(TypeUtil.CLASSLABEL);
return run(models, vecs, relation);
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// Otherwise, try any labellike.
return run(models, vecs, database.getRelation(TypeUtil.GUESSED_LABEL));
}
@@ -124,56 +124,58 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> im
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(models.getDBIDs(), DataStoreFactory.HINT_HOT);
HashSet<GeneratorSingleCluster> generators = new HashSet<>();
- for (DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) {
Model model = models.get(iditer);
- if (model instanceof GeneratorSingleCluster) {
+ if(model instanceof GeneratorSingleCluster) {
generators.add((GeneratorSingleCluster) model);
}
}
- if (generators.size() == 0) {
+ if(generators.size() == 0) {
LOG.warning("No generator models found for dataset - all points will be considered outliers.");
}
- for (GeneratorSingleCluster gen : generators) {
- for (int i = 0; i < gen.getDim(); i++) {
+ for(GeneratorSingleCluster gen : generators) {
+ for(int i = 0; i < gen.getDim(); i++) {
Distribution dist = gen.getDistribution(i);
- if (!(dist instanceof NormalDistribution)) {
+ if(!(dist instanceof NormalDistribution)) {
throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + dist);
}
}
}
- for (DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer = models.iterDBIDs(); iditer.valid(); iditer.advance()) {
double score = 1.;
// Convert to a math vector
Vector v = vecs.get(iditer).getColumnVector();
- for (GeneratorSingleCluster gen : generators) {
+ for(GeneratorSingleCluster gen : generators) {
Vector tv = v;
// Transform backwards
- if (gen.getTransformation() != null) {
+ if(gen.getTransformation() != null) {
tv = gen.getTransformation().applyInverse(v);
}
final int dim = tv.getDimensionality();
double lensq = 0.0;
int norm = 0;
- for (int i = 0; i < dim; i++) {
+ for(int i = 0; i < dim; i++) {
Distribution dist = gen.getDistribution(i);
- if (dist instanceof NormalDistribution) {
+ if(dist instanceof NormalDistribution) {
NormalDistribution d = (NormalDistribution) dist;
double delta = (tv.get(i) - d.getMean()) / d.getStddev();
lensq += delta * delta;
norm += 1;
- } else {
+ }
+ else {
throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + dist);
}
}
- if (norm > 0.) {
+ if(norm > 0.) {
// The squared distances are ChiSquared distributed
score = Math.min(score, ChiSquaredDistribution.cdf(lensq, norm));
- } else {
+ }
+ else {
score = 0.;
}
}
- if (expect < 1) {
+ if(expect < 1) {
score = expect * score / (1 - score + expect);
}
scores.putDouble(iditer, score);
@@ -210,9 +212,9 @@ public class TrivialGeneratedOutlier extends AbstractAlgorithm<OutlierResult> im
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
DoubleParameter expectP = new DoubleParameter(EXPECT_ID, 0.01);
- expectP.addConstraint(new GreaterConstraint(0.0));
- expectP.addConstraint(new LessEqualConstraint(1.0));
- if (config.grab(expectP)) {
+ expectP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ expectP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
+ if(config.grab(expectP)) {
expect = expectP.getValue();
}
}
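Aside: the scoring in the hunk above relies on the fact that the sum of `norm` squared standardized Gaussian deviations is ChiSquared(norm)-distributed, so the CDF at lensq gives the probability mass of points closer to the generator. A self-contained sketch of that idea, using the closed-form chi-squared CDF that exists for an even number of degrees of freedom (ELKI's ChiSquaredDistribution.cdf is general; this restriction is only to keep the example short):

public class ChiSquaredScoreSketch {
  // Chi-squared CDF for even degrees of freedom k:
  // P(X <= x) = 1 - exp(-x/2) * sum_{i=0}^{k/2-1} (x/2)^i / i!
  static double cdfEven(double x, int k) {
    double term = 1., sum = 1.;
    for (int i = 1; i < k / 2; i++) {
      term *= (x / 2) / i;
      sum += term;
    }
    return 1 - Math.exp(-x / 2) * sum;
  }

  public static void main(String[] args) {
    double[] tv = { 1.5, -0.3 }; // per-dimension (value - mean) / stddev
    double lensq = 0.;
    for (double d : tv) {
      lensq += d * d;
    }
    System.out.println("score = " + cdfEven(lensq, tv.length));
  }
}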
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java
index cbae17ca..8bd5f057 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AddSingleScale.java
@@ -94,7 +94,11 @@ public class AddSingleScale implements Algorithm {
for(DBIDIter iditer = rel.iterDBIDs(); iditer.valid(); iditer.advance()) {
NumberVector<?> vec = rel.get(iditer);
for(int d = 0; d < dim; d++) {
- mm.put(vec.doubleValue(d));
+ final double val = vec.doubleValue(d);
+ if(val != val) {
+ continue; // NaN
+ }
+ mm.put(val);
}
}
LinearScale scale = new LinearScale(mm.getMin(), mm.getMax());
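Aside: the NaN guard added above uses the `val != val` idiom, which is true exactly for NaN (equivalent to Double.isNaN(val)), so missing values no longer poison the min/max scale. A minimal demo of the idiom:

public class NanSkipDemo {
  public static void main(String[] args) {
    double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
    for (double val : new double[] { 1.0, Double.NaN, -2.5, 4.0 }) {
      if (val != val) {
        continue; // only NaN is unequal to itself
      }
      min = Math.min(min, val);
      max = Math.max(max, val);
    }
    System.out.println("[" + min + ", " + max + "]"); // prints [-2.5, 4.0]
  }
}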
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java
index 1b87a015..490f8ba6 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/AveragePrecisionAtK.java
@@ -47,9 +47,7 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.result.CollectionResult;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
@@ -124,34 +122,36 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ?
MeanVariance[] mvs = MeanVariance.newArray(k);
final DBIDs ids;
- if (sampling < 1.0) {
+ if(sampling < 1.0) {
int size = Math.max(1, (int) (sampling * relation.size()));
ids = DBIDUtil.randomSample(relation.getDBIDs(), size, seed);
- } else {
+ }
+ else {
ids = relation.getDBIDs();
}
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Processing points...");
}
FiniteProgress objloop = LOG.isVerbose() ? new FiniteProgress("Computing nearest neighbors", ids.size(), LOG) : null;
// sort neighbors
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
KNNList<D> knn = knnQuery.getKNNForDBID(iter, qk);
Object label = lrelation.get(iter);
int positive = 0, i = 0;
- for (DBIDIter ri = knn.iter(); i < k && ri.valid(); ri.advance()) {
- if (!includeSelf && DBIDUtil.equal(iter, ri)) {
+ for(DBIDIter ri = knn.iter(); i < k && ri.valid(); ri.advance()) {
+ if(!includeSelf && DBIDUtil.equal(iter, ri)) {
continue;
}
Object olabel = lrelation.get(ri);
- if (label == null) {
- if (olabel == null) {
+ if(label == null) {
+ if(olabel == null) {
positive += 1;
}
- } else {
- if (label.equals(olabel)) {
+ }
+ else {
+ if(label.equals(olabel)) {
positive += 1;
}
}
@@ -159,18 +159,18 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ?
mvs[i].put(precision);
i++;
}
- if (objloop != null) {
+ if(objloop != null) {
objloop.incrementProcessed(LOG);
}
}
- if (objloop != null) {
+ if(objloop != null) {
objloop.ensureCompleted(LOG);
}
// Collections.sort(results);
// Transform Histogram into a Double Vector array.
Collection<DoubleVector> res = new ArrayList<>(k);
- for (int i = 0; i < k; i++) {
+ for(int i = 0; i < k; i++) {
DoubleVector row = new DoubleVector(new double[] { mvs[i].getMean(), mvs[i].getSampleStddev() });
res.add(row);
}
@@ -239,24 +239,24 @@ public class AveragePrecisionAtK<V extends Object, D extends NumberDistance<D, ?
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kP = new IntParameter(K_ID);
- kP.addConstraint(new GreaterEqualConstraint(2));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue();
}
final DoubleParameter samplingP = new DoubleParameter(SAMPLING_ID);
- samplingP.addConstraint(new GreaterConstraint(0.0));
- samplingP.addConstraint(new LessEqualConstraint(1.0));
+ samplingP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ samplingP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE);
samplingP.setOptional(true);
- if (config.grab(samplingP)) {
+ if(config.grab(samplingP)) {
sampling = samplingP.getValue();
}
final LongParameter rndP = new LongParameter(SEED_ID);
rndP.setOptional(true);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
seed = rndP.getValue();
}
final Flag includeP = new Flag(INCLUDESELF_ID);
- if (config.grab(includeP)) {
+ if(config.grab(includeP)) {
includeSelf = includeP.isTrue();
}
}
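Aside: a standalone sketch of the precision-at-k measurement in the hunks above: walk a query's ranked neighbor list and record positive/rank at each cutoff, where a neighbor counts as positive iff its label equals the query's (with both-null treated as a match, mirroring the null handling above). Sampling and the includeSelf flag are simplified away; this is an illustration, not the ELKI class:

public class PrecisionAtK {
  static double[] precisionAtK(String queryLabel, String[] rankedLabels, int k) {
    double[] prec = new double[k];
    int positive = 0;
    for (int i = 0; i < k && i < rankedLabels.length; i++) {
      if (queryLabel == null ? rankedLabels[i] == null : queryLabel.equals(rankedLabels[i])) {
        positive++;
      }
      prec[i] = positive / (double) (i + 1); // precision at cutoff i+1
    }
    return prec;
  }

  public static void main(String[] args) {
    double[] p = precisionAtK("a", new String[] { "a", "b", "a", "a" }, 4);
    System.out.println(java.util.Arrays.toString(p)); // [1.0, 0.5, 0.666..., 0.75]
  }
}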
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java
index 3c8e1635..244af0ca 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java
@@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.OnlyOneIsAllowedToBeSetGlobalConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
@@ -159,24 +159,26 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
MeanVariance modif = new MeanVariance();
// Histogram
final ObjHistogram<long[]> histogram;
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(1, "Prepare histogram.", LOG);
}
- if (exact) {
+ if(exact) {
gminmax = exactMinMax(relation, distFunc);
histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
- } else if (sampling) {
+ }
+ else if(sampling) {
gminmax = sampleMinMax(relation, distFunc);
histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
- } else {
+ }
+ else {
histogram = new AbstractObjDynamicHistogram<long[]>(numbin) {
@Override
protected long[] downsample(Object[] data, int start, int end, int size) {
long[] ret = new long[2];
- for (int i = start; i < end; i++) {
+ for(int i = start; i < end; i++) {
long[] existing = (long[]) data[i];
- if (existing != null) {
- for (int c = 0; c < 2; c++) {
+ if(existing != null) {
+ for(int c = 0; c < 2; c++) {
ret[c] += existing[c];
}
}
@@ -186,7 +188,7 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
@Override
protected long[] aggregate(long[] first, long[] second) {
- for (int c = 0; c < 2; c++) {
+ for(int c = 0; c < 2; c++) {
first[c] += second[c];
}
return first;
@@ -204,20 +206,20 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
};
}
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.beginStep(2, "Build histogram.", LOG);
}
final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null;
// iterate per cluster
final long[] incFirst = new long[] { 1L, 0L };
final long[] incSecond = new long[] { 0L, 1L };
- for (Cluster<?> c1 : split) {
- for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
+ for(Cluster<?> c1 : split) {
+ for(DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
// in-cluster distances
DoubleMinMax iminmax = new DoubleMinMax();
- for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
// skip the point itself.
- if (DBIDUtil.equal(id1, iter2)) {
+ if(DBIDUtil.equal(id1, iter2)) {
continue;
}
double d = distFunc.distance(id1, iter2).doubleValue();
@@ -236,13 +238,13 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
// other-cluster distances
DoubleMinMax ominmax = new DoubleMinMax();
- for (Cluster<?> c2 : split) {
- if (c2 == c1) {
+ for(Cluster<?> c2 : split) {
+ if(c2 == c1) {
continue;
}
- for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
// skip the point itself (shouldn't happen though)
- if (DBIDUtil.equal(id1, iter2)) {
+ if(DBIDUtil.equal(id1, iter2)) {
continue;
}
double d = distFunc.distance(id1, iter2).doubleValue();
@@ -259,33 +261,33 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
// min/max
gominmax.put(ominmax.getMin());
gominmax.put(ominmax.getMax());
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
// Update values (only needed for sampling case).
gminmax.setFirst(Math.min(giminmax.getMin(), gominmax.getMin()));
gminmax.setSecond(Math.max(giminmax.getMax(), gominmax.getMax()));
- if (stepprog != null) {
+ if(stepprog != null) {
stepprog.setCompleted(LOG);
}
// count the number of samples we have in the data
long inum = 0;
long onum = 0;
- for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
+ for(ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
inum += iter.getValue()[0];
onum += iter.getValue()[1];
}
long bnum = inum + onum;
Collection<DoubleVector> binstat = new ArrayList<>(numbin);
- for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
+ for(ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
final long[] value = iter.getValue();
final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize();
final double icaf = ((double) value[0]) / bnum / histogram.getBinsize();
@@ -327,26 +329,26 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
ArrayModifiableDBIDs randomset = DBIDUtil.newArray(randomsize);
DBIDIter iter = relation.iterDBIDs();
- if (!iter.valid()) {
+ if(!iter.valid()) {
throw new IllegalStateException(ExceptionMessages.DATABASE_EMPTY);
}
DBID firstid = DBIDUtil.deref(iter);
iter.advance();
minhotset.add(DBIDUtil.newPair(Double.MAX_VALUE, firstid));
maxhotset.add(DBIDUtil.newPair(Double.MIN_VALUE, firstid));
- for (; iter.valid(); iter.advance()) {
+ for(; iter.valid(); iter.advance()) {
// generate candidates for min distance.
ArrayList<DoubleDBIDPair> np = new ArrayList<>(k * 2 + randomsize * 2);
- for (DoubleDBIDPair pair : minhotset) {
+ for(DoubleDBIDPair pair : minhotset) {
// skip the object itself
- if (DBIDUtil.equal(iter, pair)) {
+ if(DBIDUtil.equal(iter, pair)) {
continue;
}
double d = distFunc.distance(iter, pair).doubleValue();
np.add(DBIDUtil.newPair(d, iter));
np.add(DBIDUtil.newPair(d, pair));
}
- for (DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) {
double d = distFunc.distance(iter, iter2).doubleValue();
np.add(DBIDUtil.newPair(d, iter));
np.add(DBIDUtil.newPair(d, iter2));
@@ -356,16 +358,16 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
// generate candidates for max distance.
ArrayList<DoubleDBIDPair> np2 = new ArrayList<>(k * 2 + randomsize * 2);
- for (DoubleDBIDPair pair : minhotset) {
+ for(DoubleDBIDPair pair : minhotset) {
// skip the object itself
- if (DBIDUtil.equal(iter, pair)) {
+ if(DBIDUtil.equal(iter, pair)) {
continue;
}
double d = distFunc.distance(iter, pair).doubleValue();
np2.add(DBIDUtil.newPair(d, iter));
np2.add(DBIDUtil.newPair(d, pair));
}
- for (DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) {
+ for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) {
double d = distFunc.distance(iter, iter2).doubleValue();
        np2.add(DBIDUtil.newPair(d, iter)); // np2, not np: these are max-distance candidates
        np2.add(DBIDUtil.newPair(d, iter2));
@@ -374,9 +376,10 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
shrinkHeap(maxhotset, k);
// update random set
- if (randomset.size() < randomsize) {
+ if(randomset.size() < randomsize) {
randomset.add(iter);
- } else if (rnd.nextDouble() < rprob) {
+ }
+ else if(rnd.nextDouble() < rprob) {
randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), iter);
}
}
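
The update above maintains a fixed-size set of random objects to pair against the hotset candidates: fill the buffer first, then replace a uniformly random slot with a fixed probability. A minimal, self-contained sketch of that scheme in plain Java (class and method names are illustrative); note that classic reservoir sampling, Vitter's Algorithm R, would instead replace with probability size/i for the i-th element:

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class FixedRateReservoir<T> {
  private final List<T> buffer;
  private final int size;
  private final double rprob;
  private final Random rnd;

  public FixedRateReservoir(int size, double rprob, Random rnd) {
    this.buffer = new ArrayList<>(size);
    this.size = size;
    this.rprob = rprob;
    this.rnd = rnd;
  }

  public void offer(T item) {
    if(buffer.size() < size) {
      // fill phase: keep everything until the buffer is full
      buffer.add(item);
    }
    else if(rnd.nextDouble() < rprob) {
      // replacement phase: overwrite a uniformly random slot
      buffer.set(rnd.nextInt(size), item);
    }
  }

  public List<T> sample() {
    return buffer;
  }
}
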
@@ -393,10 +396,10 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
private DoubleMinMax exactMinMax(Relation<O> relation, DistanceQuery<O, D> distFunc) {
DoubleMinMax minmax = new DoubleMinMax();
// find exact minimum and maximum first.
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- for (DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ for(DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) {
// skip the point itself.
- if (DBIDUtil.equal(iditer, iditer2)) {
+ if(DBIDUtil.equal(iditer, iditer2)) {
continue;
}
double d = distFunc.distance(iditer, iditer2).doubleValue();
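
exactMinMax scans all ordered pairs, so it costs O(n^2) distance computations; that is what the sampling path above is designed to avoid. A minimal stand-alone sketch of the same scan, with Euclidean distance as a stand-in for the configured distance function:

public class PairwiseMinMax {
  // Returns { min, max } over all distinct pairs; O(n^2) distance calls.
  static double[] exactMinMax(double[][] points) {
    double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
    for(int i = 0; i < points.length; i++) {
      for(int j = 0; j < points.length; j++) {
        if(i == j) {
          continue; // skip the point itself
        }
        double d = euclidean(points[i], points[j]);
        min = Math.min(min, d);
        max = Math.max(max, d);
      }
    }
    return new double[] { min, max };
  }

  static double euclidean(double[] a, double[] b) {
    double agg = 0.;
    for(int i = 0; i < a.length; i++) {
      double diff = a[i] - b[i];
      agg += diff * diff;
    }
    return Math.sqrt(agg);
  }
}
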
@@ -416,11 +419,12 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
// drop duplicates
ModifiableDBIDs seenids = DBIDUtil.newHashSet(2 * k);
int cnt = 0;
- for (Iterator<DoubleDBIDPair> i = hotset.iterator(); i.hasNext();) {
+ for(Iterator<DoubleDBIDPair> i = hotset.iterator(); i.hasNext();) {
DoubleDBIDPair p = i.next();
- if (cnt > k || seenids.contains(p)) {
+ if(cnt > k || seenids.contains(p)) {
i.remove();
- } else {
+ }
+ else {
seenids.add(p);
cnt++;
}
@@ -464,18 +468,18 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter numbinP = new IntParameter(HISTOGRAM_BINS_ID, 20);
- numbinP.addConstraint(new GreaterEqualConstraint(2));
- if (config.grab(numbinP)) {
+ numbinP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(numbinP)) {
numbin = numbinP.getValue();
}
final Flag exactF = new Flag(EXACT_ID);
- if (config.grab(exactF)) {
+ if(config.grab(exactF)) {
exact = exactF.getValue();
}
final Flag samplingF = new Flag(SAMPLING_ID);
- if (config.grab(samplingF)) {
+ if(config.grab(samplingF)) {
sampling = samplingF.getValue();
}
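
The class also imports OnlyOneIsAllowedToBeSetGlobalConstraint, which is how the exact and sampling flags can be declared mutually exclusive after both have been grabbed. A minimal sketch of that step, assuming the list-based constructor of the constraint (this fragment would sit inside the makeOptions method shown above):

  // Sketch only: register a global constraint that rejects configurations
  // where more than one of the two flags is set.
  List<Parameter<?>> onlyOne = new ArrayList<>(2);
  onlyOne.add(exactF);
  onlyOne.add(samplingF);
  config.checkConstraint(new OnlyOneIsAllowedToBeSetGlobalConstraint(onlyOne));
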
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java
index 76e5ef66..d5d8e407 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/EvaluateRankingQuality.java
@@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -121,7 +121,7 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD
final DistanceQuery<V, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
final KNNQuery<V, D> knnQuery = database.getKNNQuery(distQuery, relation.size());
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Preprocessing clusters...");
}
// Cluster by labels
@@ -130,7 +130,7 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD
// Compute cluster averages and covariance matrix
HashMap<Cluster<?>, Vector> averages = new HashMap<>(split.size());
HashMap<Cluster<?>, Matrix> covmats = new HashMap<>(split.size());
- for (Cluster<?> clus : split) {
+ for(Cluster<?> clus : split) {
CovarianceMatrix covmat = CovarianceMatrix.make(relation, clus.getIDs());
averages.put(clus, covmat.getMeanVector());
covmats.put(clus, covmat.destroyToNaiveMatrix());
@@ -138,42 +138,42 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD
MeanVarianceStaticHistogram hist = new MeanVarianceStaticHistogram(numbins, 0.0, 1.0);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Processing points...");
}
FiniteProgress rocloop = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null;
// sort neighbors
- for (Cluster<?> clus : split) {
+ for(Cluster<?> clus : split) {
ArrayList<DoubleDBIDPair> cmem = new ArrayList<>(clus.size());
Vector av = averages.get(clus);
Matrix covm = covmats.get(clus);
- for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
+ for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
double d = MathUtil.mahalanobisDistance(covm, relation.get(iter).getColumnVector().minusEquals(av));
cmem.add(DBIDUtil.newPair(d, iter));
}
Collections.sort(cmem);
- for (int ind = 0; ind < cmem.size(); ind++) {
+ for(int ind = 0; ind < cmem.size(); ind++) {
KNNList<D> knn = knnQuery.getKNNForDBID(cmem.get(ind), relation.size());
double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn);
hist.put(((double) ind) / clus.size(), result);
- if (rocloop != null) {
+ if(rocloop != null) {
rocloop.incrementProcessed(LOG);
}
}
}
- if (rocloop != null) {
+ if(rocloop != null) {
rocloop.ensureCompleted(LOG);
}
// Collections.sort(results);
// Transform Histogram into a Double Vector array.
Collection<DoubleVector> res = new ArrayList<>(relation.size());
- for (ObjHistogram.Iter<MeanVariance> iter = hist.iter(); iter.valid(); iter.advance()) {
+ for(ObjHistogram.Iter<MeanVariance> iter = hist.iter(); iter.valid(); iter.advance()) {
DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), iter.getValue().getCount(), iter.getValue().getMean(), iter.getValue().getSampleVariance() });
res.add(row);
}
@@ -207,8 +207,8 @@ public class EvaluateRankingQuality<V extends NumberVector<?>, D extends NumberD
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter param = new IntParameter(HISTOGRAM_BINS_ID, 20);
- param.addConstraint(new GreaterEqualConstraint(2));
- if (config.grab(param)) {
+ param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(param)) {
numbins = param.getValue();
}
}
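
Within each cluster, points are ranked by MathUtil.mahalanobisDistance(covm, x - av), which reads as the quadratic form sqrt((x - mu)^T W (x - mu)) for a given weight matrix W; the classical Mahalanobis distance takes W = Sigma^{-1}, the inverse covariance matrix. A minimal plain-Java sketch of that quadratic form (class and method names illustrative):

public class MahalanobisSketch {
  // d(x) = sqrt((x - mu)^T w (x - mu)); pass w = inverse covariance matrix
  // for the classical Mahalanobis distance.
  static double mahalanobis(double[] x, double[] mu, double[][] w) {
    final int dim = x.length;
    double[] diff = new double[dim];
    for(int i = 0; i < dim; i++) {
      diff[i] = x[i] - mu[i];
    }
    double agg = 0.;
    for(int i = 0; i < dim; i++) {
      double row = 0.;
      for(int j = 0; j < dim; j++) {
        row += w[i][j] * diff[j];
      }
      agg += row * diff[i];
    }
    return Math.sqrt(agg);
  }
}
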
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java
index 58018029..7d0f1bb2 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/RankingQualityHistogram.java
@@ -51,7 +51,7 @@ import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.DoubleStaticHistog
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -109,7 +109,7 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends
final DistanceQuery<O, D> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction());
final KNNQuery<O, D> knnQuery = database.getKNNQuery(distanceQuery, relation.size());
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Preprocessing clusters...");
}
// Cluster by labels
@@ -117,33 +117,33 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends
DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);
- if (LOG.isVerbose()) {
+ if(LOG.isVerbose()) {
LOG.verbose("Processing points...");
}
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null;
MeanVariance mv = new MeanVariance();
// sort neighbors
- for (Cluster<?> clus : split) {
- for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
+ for(Cluster<?> clus : split) {
+ for(DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
KNNList<D> knn = knnQuery.getKNNForDBID(iter, relation.size());
double result = ROC.computeROCAUCDistanceResult(relation.size(), clus, knn);
mv.put(result);
hist.increment(result, 1. / relation.size());
- if (progress != null) {
+ if(progress != null) {
progress.incrementProcessed(LOG);
}
}
}
- if (progress != null) {
+ if(progress != null) {
progress.ensureCompleted(LOG);
}
// Transform Histogram into a Double Vector array.
Collection<DoubleVector> res = new ArrayList<>(relation.size());
- for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
+ for(DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), iter.getValue() });
res.add(row);
}
@@ -179,8 +179,8 @@ public class RankingQualityHistogram<O, D extends NumberDistance<D, ?>> extends
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter param = new IntParameter(HISTOGRAM_BINS_ID, 100);
- param.addConstraint(new GreaterEqualConstraint(2));
- if (config.grab(param)) {
+ param.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(param)) {
numbins = param.getValue();
}
}
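
In RankingQualityHistogram, each observation is added with weight 1/n (hist.increment(result, 1. / relation.size())), so the bin values of the [0, 1] histogram sum to one and the output is a discrete distribution over per-point ROC AUC values. A minimal, self-contained sketch of such a normalized equal-width histogram (class name illustrative; ELKI's DoubleStaticHistogram plays this role above):

public class NormalizedHistogram {
  private final double[] bins;

  public NormalizedHistogram(int numbins) {
    this.bins = new double[numbins];
  }

  public void increment(double value, double weight) {
    // equal-width bins over [0, 1]; values on the upper boundary
    // are clamped into the last bin.
    int b = Math.min((int) (value * bins.length), bins.length - 1);
    bins[Math.max(b, 0)] += weight;
  }

  public double[] data() {
    return bins;
  }
}

// Usage: one entry of weight 1/n per ROC AUC value, so the bins sum to 1:
// NormalizedHistogram hist = new NormalizedHistogram(100);
// for(double auc : aucs) { hist.increment(auc, 1. / aucs.length); }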